作者 刘锟

合并分支 'akun' 到 'master'

Akun



查看合并请求 !153
@@ -11,7 +11,6 @@ use App\Services\ProjectServer; @@ -11,7 +11,6 @@ use App\Services\ProjectServer;
11 use Illuminate\Console\Command; 11 use Illuminate\Console\Command;
12 use Illuminate\Support\Facades\Cache; 12 use Illuminate\Support\Facades\Cache;
13 use Illuminate\Support\Facades\DB; 13 use Illuminate\Support\Facades\DB;
14 -use Illuminate\Support\Facades\Log;  
15 use Illuminate\Support\Facades\Redis; 14 use Illuminate\Support\Facades\Redis;
16 15
17 /** 16 /**
@@ -358,7 +357,7 @@ class HtmlCollect extends Command @@ -358,7 +357,7 @@ class HtmlCollect extends Command
358 continue; 357 continue;
359 } 358 }
360 $path_arr = explode('.', $vcs); 359 $path_arr = explode('.', $vcs);
361 - if (end($path_arr) == 'html') { 360 + if (in_array(end($path_arr), ['html', 'php', 'com', 'xml'])) {
362 continue; 361 continue;
363 } 362 }
364 363
@@ -9,6 +9,7 @@ use App\Models\RouteMap\RouteMap; @@ -9,6 +9,7 @@ use App\Models\RouteMap\RouteMap;
9 use App\Services\CosService; 9 use App\Services\CosService;
10 use App\Services\ProjectServer; 10 use App\Services\ProjectServer;
11 use Illuminate\Console\Command; 11 use Illuminate\Console\Command;
  12 +use Illuminate\Support\Facades\Cache;
12 use Illuminate\Support\Facades\DB; 13 use Illuminate\Support\Facades\DB;
13 use Illuminate\Support\Facades\Redis; 14 use Illuminate\Support\Facades\Redis;
14 15
@@ -75,29 +76,8 @@ class HtmlLanguageCollect extends Command @@ -75,29 +76,8 @@ class HtmlLanguageCollect extends Command
75 $collect_info->status = CollectTask::STATUS_ING; 76 $collect_info->status = CollectTask::STATUS_ING;
76 $collect_info->save(); 77 $collect_info->save();
77 78
78 - //获取英文站域名  
79 - $domain = $collect_info->domain;  
80 - if (strpos($domain, '/') !== false) {  
81 - $domain = substr($domain, 0, strpos($domain, '/'));  
82 - } else {  
83 - $domain = str_replace($collect_info->language, 'www', $domain);  
84 - }  
85 -  
86 - $web_url_domain = $domain;  
87 - $home_url = $domain;  
88 - $url_web_config = 'https://' . $domain . '/wp-content/cache/user_config.text';  
89 - $data_config = curl_c($url_web_config);  
90 - if ($data_config) {  
91 - $web_url_arr = parse_url($data_config['web_url_domain'] ?? '');  
92 - if (isset($web_url_arr['host'])) {  
93 - $web_url_domain = $web_url_arr['host'];  
94 - }  
95 -  
96 - $home_url_arr = parse_url($data_config['home_url'] ?? '');  
97 - if (isset($home_url_arr['host'])) {  
98 - $home_url = $home_url_arr['host'];  
99 - }  
100 - } 79 + //获取站点正式和测试域名
  80 + $old_info = $this->getOldDomain($project_id, $collect_info->domain);
101 81
102 //采集html页面,下载资源到本地并替换 82 //采集html页面,下载资源到本地并替换
103 try { 83 try {
@@ -110,7 +90,15 @@ class HtmlLanguageCollect extends Command @@ -110,7 +90,15 @@ class HtmlLanguageCollect extends Command
110 return true; 90 return true;
111 } 91 }
112 92
113 - $source_list = $this->html_preg($html, $project_id, $collect_info->domain, $web_url_domain, $home_url); 93 + //如果有base64图片,先替换掉,再进行资源匹配
  94 + $new_html = $html;
  95 + preg_match_all("/data:([^;]*);base64,(.*)?\"/", $new_html, $result_img);
  96 + $img_base64 = $result_img[2] ?? [];
  97 + foreach ($img_base64 as $v64) {
  98 + $new_html = str_replace($v64, '', $new_html);
  99 + }
  100 +
  101 + $source_list = $this->html_preg($new_html, $project_id, $collect_info->domain, $old_info['web_url_domain'], $old_info['home_url']);
114 102
115 if ($source_list) { 103 if ($source_list) {
116 $html = $this->upload_source($html, $source_list, $project_id); 104 $html = $this->upload_source($html, $source_list, $project_id);
@@ -162,9 +150,6 @@ class HtmlLanguageCollect extends Command @@ -162,9 +150,6 @@ class HtmlLanguageCollect extends Command
162 case 'blog': 150 case 'blog':
163 $source = RouteMap::SOURCE_BLOG; 151 $source = RouteMap::SOURCE_BLOG;
164 break; 152 break;
165 - case 'tag':  
166 - $source = RouteMap::SOURCE_PRODUCT_KEYWORD;  
167 - break;  
168 default: 153 default:
169 $source = RouteMap::SOURCE_PRODUCT; 154 $source = RouteMap::SOURCE_PRODUCT;
170 break; 155 break;
@@ -197,6 +182,42 @@ class HtmlLanguageCollect extends Command @@ -197,6 +182,42 @@ class HtmlLanguageCollect extends Command
197 return $task_id; 182 return $task_id;
198 } 183 }
199 184
  185 + // Resolve the site's old domains for a project: returns ['web_url_domain' => host, 'home_url' => host], read from cache when present.
  186 + protected function getOldDomain($project_id, $domain)
  187 + {
  188 + $key = 'project_collect_lan_domain_' . $project_id; // cache key depends only on $project_id, not on $domain
  189 +
  190 + $data = Cache::get($key);
  191 +
  192 + if (!$data) {
  193 + $web_url_domain = $domain; // both hosts default to the passed-in domain
  194 + $home_url = $domain;
  195 +
  196 + $url_web_config = 'https://' . $domain . '/wp-content/cache/user_config.text';
  197 + $data_config = curl_c($url_web_config); // curl_c is defined elsewhere; presumably returns the decoded config as an array, or a falsy value on failure — confirm
  198 + if ($data_config) {
  199 + $web_url_arr = parse_url($data_config['web_url_domain'] ?? '');
  200 + if (isset($web_url_arr['host'])) {
  201 + $web_url_domain = $web_url_arr['host']; // override the default only when the configured URL has a host part
  202 + }
  203 +
  204 + $home_url_arr = parse_url($data_config['home_url'] ?? '');
  205 + if (isset($home_url_arr['host'])) {
  206 + $home_url = $home_url_arr['host'];
  207 + }
  208 + }
  209 +
  210 + $data = [
  211 + 'web_url_domain' => $web_url_domain,
  212 + 'home_url' => $home_url,
  213 + ];
  214 +
  215 + Cache::add($key, $data, 3600);// cache for 1 hour (TTL unit depends on the framework version — confirm). NOTE(review): the fallback value is also cached when the config fetch returned nothing.
  216 + }
  217 +
  218 + return $data;
  219 + }
  220 +
200 //正则匹配html资源 221 //正则匹配html资源
201 protected function html_preg($html, $project_id, $domain, $web_url_domain, $home_url) 222 protected function html_preg($html, $project_id, $domain, $web_url_domain, $home_url)
202 { 223 {
@@ -246,6 +267,13 @@ class HtmlLanguageCollect extends Command @@ -246,6 +267,13 @@ class HtmlLanguageCollect extends Command
246 $check_vc_b && $source[] = $check_vc_b; 267 $check_vc_b && $source[] = $check_vc_b;
247 } 268 }
248 269
  270 + //a标签下载资源
  271 + preg_match_all('/<a\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_a);
  272 + $down = $result_a[2] ?? [];
  273 + foreach ($down as $vd) {
  274 + $check_vd = $this->url_check($vd, $project_id, $domain, $web_url_domain, $home_url);
  275 + $check_vd && $source[] = $check_vd;
  276 + }
249 277
250 return $source; 278 return $source;
251 } 279 }
@@ -265,7 +293,7 @@ class HtmlLanguageCollect extends Command @@ -265,7 +293,7 @@ class HtmlLanguageCollect extends Command
265 (empty($host) || $host == $web_url_domain || $host == $home_url) 293 (empty($host) || $host == $web_url_domain || $host == $home_url)
266 && $path 294 && $path
267 && (strpos($path, '.') !== false) 295 && (strpos($path, '.') !== false)
268 - && (end($path_arr) != 'html') 296 + && (!in_array(end($path_arr), ['html', 'php', 'com', 'xml']))
269 ) { 297 ) {
270 $source = CollectSource::where('project_id', $project_id)->where('origin', $url)->first(); 298 $source = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
271 if (!$source) { 299 if (!$source) {
@@ -329,7 +357,7 @@ class HtmlLanguageCollect extends Command @@ -329,7 +357,7 @@ class HtmlLanguageCollect extends Command
329 continue; 357 continue;
330 } 358 }
331 $path_arr = explode('.', $vcs); 359 $path_arr = explode('.', $vcs);
332 - if(end($path_arr) == 'html'){ 360 + if(in_array(end($path_arr), ['html', 'php', 'com', 'xml'])){
333 continue; 361 continue;
334 } 362 }
335 363