作者 刘锟

update

... ... @@ -11,7 +11,6 @@ use App\Services\ProjectServer;
use Illuminate\Console\Command;
use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Redis;
/**
... ... @@ -358,7 +357,7 @@ class HtmlCollect extends Command
continue;
}
$path_arr = explode('.', $vcs);
if (end($path_arr) == 'html') {
if (in_array(end($path_arr), ['html', 'php', 'com', 'xml'])) {
continue;
}
... ...
... ... @@ -9,6 +9,7 @@ use App\Models\RouteMap\RouteMap;
use App\Services\CosService;
use App\Services\ProjectServer;
use Illuminate\Console\Command;
use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\Redis;
... ... @@ -75,29 +76,8 @@ class HtmlLanguageCollect extends Command
$collect_info->status = CollectTask::STATUS_ING;
$collect_info->save();
//获取英文站域名
$domain = $collect_info->domain;
if (strpos($domain, '/') !== false) {
$domain = substr($domain, 0, strpos($domain, '/'));
} else {
$domain = str_replace($collect_info->language, 'www', $domain);
}
$web_url_domain = $domain;
$home_url = $domain;
$url_web_config = 'https://' . $domain . '/wp-content/cache/user_config.text';
$data_config = curl_c($url_web_config);
if ($data_config) {
$web_url_arr = parse_url($data_config['web_url_domain'] ?? '');
if (isset($web_url_arr['host'])) {
$web_url_domain = $web_url_arr['host'];
}
$home_url_arr = parse_url($data_config['home_url'] ?? '');
if (isset($home_url_arr['host'])) {
$home_url = $home_url_arr['host'];
}
}
//获取站点正式和测试域名
$old_info = $this->getOldDomain($project_id, $collect_info->domain);
//采集html页面,下载资源到本地并替换
try {
... ... @@ -110,7 +90,15 @@ class HtmlLanguageCollect extends Command
return true;
}
$source_list = $this->html_preg($html, $project_id, $collect_info->domain, $web_url_domain, $home_url);
//如果有base64图片,先替换掉,再进行资源匹配
$new_html = $html;
preg_match_all("/data:([^;]*);base64,(.*)?\"/", $new_html, $result_img);
$img_base64 = $result_img[2] ?? [];
foreach ($img_base64 as $v64) {
$new_html = str_replace($v64, '', $new_html);
}
$source_list = $this->html_preg($new_html, $project_id, $collect_info->domain, $old_info['web_url_domain'], $old_info['home_url']);
if ($source_list) {
$html = $this->upload_source($html, $source_list, $project_id);
... ... @@ -162,9 +150,6 @@ class HtmlLanguageCollect extends Command
case 'blog':
$source = RouteMap::SOURCE_BLOG;
break;
case 'tag':
$source = RouteMap::SOURCE_PRODUCT_KEYWORD;
break;
default:
$source = RouteMap::SOURCE_PRODUCT;
break;
... ... @@ -197,6 +182,42 @@ class HtmlLanguageCollect extends Command
return $task_id;
}
/**
 * Resolve the site's "old" domains (the hosts its own resources may be served from).
 *
 * Requests https://{$domain}/wp-content/cache/user_config.text through curl_c() and
 * takes the host part of its `web_url_domain` and `home_url` entries. When the config
 * is empty, or an entry has no parsable host, $domain itself is used for that entry.
 * The result is cached to avoid repeating the remote request on every call.
 *
 * @param mixed  $project_id Project identifier; part of the cache key.
 * @param string $domain     Domain being collected; used to build the config URL and as the fallback value.
 * @return array Array with the keys `web_url_domain` and `home_url`.
 */
protected function getOldDomain($project_id, $domain)
{
    // The fallback values below are the $domain argument itself, so the cache entry
    // has to be scoped to the domain as well as the project. With a project-only key,
    // the first domain's fallback would be returned for every other domain of the project.
    $key = 'project_collect_lan_domain_' . $project_id . '_' . md5((string)$domain);
    $data = Cache::get($key);
    if (!$data) {
        $web_url_domain = $domain;
        $home_url = $domain;
        $url_web_config = 'https://' . $domain . '/wp-content/cache/user_config.text';
        $data_config = curl_c($url_web_config);
        if ($data_config) {
            // parse_url() yields no 'host' key for empty or scheme-less values,
            // in which case the $domain fallback is kept.
            $web_url_arr = parse_url($data_config['web_url_domain'] ?? '');
            if (isset($web_url_arr['host'])) {
                $web_url_domain = $web_url_arr['host'];
            }
            $home_url_arr = parse_url($data_config['home_url'] ?? '');
            if (isset($home_url_arr['host'])) {
                $home_url = $home_url_arr['host'];
            }
        }
        $data = [
            'web_url_domain' => $web_url_domain,
            'home_url' => $home_url,
        ];
        // Intended lifetime is one hour. NOTE(review): 3600 is read as seconds by
        // Laravel 5.8+, while older releases read the TTL as minutes; confirm the
        // framework version in use.
        Cache::add($key, $data, 3600);
    }
    return $data;
}
//正则匹配html资源
protected function html_preg($html, $project_id, $domain, $web_url_domain, $home_url)
{
... ... @@ -246,6 +267,13 @@ class HtmlLanguageCollect extends Command
$check_vc_b && $source[] = $check_vc_b;
}
//a标签下载资源
preg_match_all('/<a\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_a);
$down = $result_a[2] ?? [];
foreach ($down as $vd) {
$check_vd = $this->url_check($vd, $project_id, $domain, $web_url_domain, $home_url);
$check_vd && $source[] = $check_vd;
}
return $source;
}
... ... @@ -265,7 +293,7 @@ class HtmlLanguageCollect extends Command
(empty($host) || $host == $web_url_domain || $host == $home_url)
&& $path
&& (strpos($path, '.') !== false)
&& (end($path_arr) != 'html')
&& (!in_array(end($path_arr), ['html', 'php', 'com', 'xml']))
) {
$source = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
if (!$source) {
... ... @@ -329,7 +357,7 @@ class HtmlLanguageCollect extends Command
continue;
}
$path_arr = explode('.', $vcs);
if(end($path_arr) == 'html'){
if(in_array(end($path_arr), ['html', 'php', 'com', 'xml'])){
continue;
}
... ...