|
@@ -9,6 +9,7 @@ use App\Models\RouteMap\RouteMap; |
|
@@ -9,6 +9,7 @@ use App\Models\RouteMap\RouteMap; |
|
9
|
use App\Services\CosService;
|
9
|
use App\Services\CosService;
|
|
10
|
use App\Services\ProjectServer;
|
10
|
use App\Services\ProjectServer;
|
|
11
|
use Illuminate\Console\Command;
|
11
|
use Illuminate\Console\Command;
|
|
|
|
12
|
+use Illuminate\Support\Facades\Cache;
|
|
12
|
use Illuminate\Support\Facades\DB;
|
13
|
use Illuminate\Support\Facades\DB;
|
|
13
|
use Illuminate\Support\Facades\Redis;
|
14
|
use Illuminate\Support\Facades\Redis;
|
|
14
|
|
15
|
|
|
@@ -75,29 +76,8 @@ class HtmlLanguageCollect extends Command |
|
@@ -75,29 +76,8 @@ class HtmlLanguageCollect extends Command |
|
75
|
$collect_info->status = CollectTask::STATUS_ING;
|
76
|
$collect_info->status = CollectTask::STATUS_ING;
|
|
76
|
$collect_info->save();
|
77
|
$collect_info->save();
|
|
77
|
|
78
|
|
|
78
|
- //获取英文站域名
|
|
|
|
79
|
- $domain = $collect_info->domain;
|
|
|
|
80
|
- if (strpos($domain, '/') !== false) {
|
|
|
|
81
|
- $domain = substr($domain, 0, strpos($domain, '/'));
|
|
|
|
82
|
- } else {
|
|
|
|
83
|
- $domain = str_replace($collect_info->language, 'www', $domain);
|
|
|
|
84
|
- }
|
|
|
|
85
|
-
|
|
|
|
86
|
- $web_url_domain = $domain;
|
|
|
|
87
|
- $home_url = $domain;
|
|
|
|
88
|
- $url_web_config = 'https://' . $domain . '/wp-content/cache/user_config.text';
|
|
|
|
89
|
- $data_config = curl_c($url_web_config);
|
|
|
|
90
|
- if ($data_config) {
|
|
|
|
91
|
- $web_url_arr = parse_url($data_config['web_url_domain'] ?? '');
|
|
|
|
92
|
- if (isset($web_url_arr['host'])) {
|
|
|
|
93
|
- $web_url_domain = $web_url_arr['host'];
|
|
|
|
94
|
- }
|
|
|
|
95
|
-
|
|
|
|
96
|
- $home_url_arr = parse_url($data_config['home_url'] ?? '');
|
|
|
|
97
|
- if (isset($home_url_arr['host'])) {
|
|
|
|
98
|
- $home_url = $home_url_arr['host'];
|
|
|
|
99
|
- }
|
|
|
|
100
|
- }
|
79
|
+ //获取站点正式和测试域名
|
|
|
|
80
|
+ $old_info = $this->getOldDomain($project_id, $collect_info->domain);
|
|
101
|
|
81
|
|
|
102
|
//采集html页面,下载资源到本地并替换
|
82
|
//采集html页面,下载资源到本地并替换
|
|
103
|
try {
|
83
|
try {
|
|
@@ -110,7 +90,15 @@ class HtmlLanguageCollect extends Command |
|
@@ -110,7 +90,15 @@ class HtmlLanguageCollect extends Command |
|
110
|
return true;
|
90
|
return true;
|
|
111
|
}
|
91
|
}
|
|
112
|
|
92
|
|
|
113
|
- $source_list = $this->html_preg($html, $project_id, $collect_info->domain, $web_url_domain, $home_url);
|
93
|
+ //如果有base64图片,先替换掉,再进行资源匹配
|
|
|
|
94
|
+ $new_html = $html;
|
|
|
|
95
|
+ preg_match_all("/data:([^;]*);base64,(.*)?\"/", $new_html, $result_img);
|
|
|
|
96
|
+ $img_base64 = $result_img[2] ?? [];
|
|
|
|
97
|
+ foreach ($img_base64 as $v64) {
|
|
|
|
98
|
+ $new_html = str_replace($v64, '', $new_html);
|
|
|
|
99
|
+ }
|
|
|
|
100
|
+
|
|
|
|
101
|
+ $source_list = $this->html_preg($new_html, $project_id, $collect_info->domain, $old_info['web_url_domain'], $old_info['home_url']);
|
|
114
|
|
102
|
|
|
115
|
if ($source_list) {
|
103
|
if ($source_list) {
|
|
116
|
$html = $this->upload_source($html, $source_list, $project_id);
|
104
|
$html = $this->upload_source($html, $source_list, $project_id);
|
|
@@ -162,9 +150,6 @@ class HtmlLanguageCollect extends Command |
|
@@ -162,9 +150,6 @@ class HtmlLanguageCollect extends Command |
|
162
|
case 'blog':
|
150
|
case 'blog':
|
|
163
|
$source = RouteMap::SOURCE_BLOG;
|
151
|
$source = RouteMap::SOURCE_BLOG;
|
|
164
|
break;
|
152
|
break;
|
|
165
|
- case 'tag':
|
|
|
|
166
|
- $source = RouteMap::SOURCE_PRODUCT_KEYWORD;
|
|
|
|
167
|
- break;
|
|
|
|
168
|
default:
|
153
|
default:
|
|
169
|
$source = RouteMap::SOURCE_PRODUCT;
|
154
|
$source = RouteMap::SOURCE_PRODUCT;
|
|
170
|
break;
|
155
|
break;
|
|
@@ -197,6 +182,42 @@ class HtmlLanguageCollect extends Command |
|
@@ -197,6 +182,42 @@ class HtmlLanguageCollect extends Command |
|
197
|
return $task_id;
|
182
|
return $task_id;
|
|
198
|
}
|
183
|
}
|
|
199
|
|
184
|
|
|
|
|
185
|
/**
 * Resolve the hostnames that identify the project's own site.
 *
 * The values are read from the `web_url_domain` and `home_url` entries of
 * the site's /wp-content/cache/user_config.text and reduced to their host
 * part. When the file cannot be fetched, or an entry has no host, the host
 * part of $domain is used for that entry instead.
 *
 * A successfully fetched result is cached per project. Fallback-only
 * results are not cached: they are derived from the $domain of the task
 * being processed, and the cache key is shared by the whole project, so
 * caching them would hand one task's fallback to every other task.
 *
 * @param mixed  $project_id Project identifier, used to build the cache key.
 * @param string $domain     Domain of the collect task; may carry a path
 *                           suffix such as "example.com/fr".
 *
 * @return array Array with the string keys 'web_url_domain' and 'home_url'.
 */
protected function getOldDomain($project_id, $domain)
{
    $key = 'project_collect_lan_domain_' . $project_id;

    $data = Cache::get($key);
    if ($data) {
        return $data;
    }

    // Callers compare these values with parse_url() hosts, which never
    // include a path, so keep only what precedes the first slash.
    $host = explode('/', (string)$domain, 2)[0];

    $data = [
        'web_url_domain' => $host,
        'home_url' => $host,
    ];

    $data_config = curl_c('https://' . $host . '/wp-content/cache/user_config.text');
    if (!is_array($data_config) || !$data_config) {
        // Fetch failed or returned something unusable: answer with the
        // fallback but leave the cache empty so the next call retries.
        return $data;
    }

    foreach (array_keys($data) as $field) {
        $parsed = parse_url($data_config[$field] ?? '');
        if (isset($parsed['host'])) {
            $data[$field] = $parsed['host'];
        }
    }

    // Intended lifetime is one hour.
    // NOTE(review): an integer TTL is seconds on Laravel >= 5.8 but minutes
    // on older releases — confirm against the framework version in use.
    Cache::add($key, $data, 3600);

    return $data;
}
|
|
|
|
220
|
+
|
|
200
|
//正则匹配html资源
|
221
|
//正则匹配html资源
|
|
201
|
protected function html_preg($html, $project_id, $domain, $web_url_domain, $home_url)
|
222
|
protected function html_preg($html, $project_id, $domain, $web_url_domain, $home_url)
|
|
202
|
{
|
223
|
{
|
|
@@ -246,6 +267,13 @@ class HtmlLanguageCollect extends Command |
|
@@ -246,6 +267,13 @@ class HtmlLanguageCollect extends Command |
|
246
|
$check_vc_b && $source[] = $check_vc_b;
|
267
|
$check_vc_b && $source[] = $check_vc_b;
|
|
247
|
}
|
268
|
}
|
|
248
|
|
269
|
|
|
|
|
270
|
+ //a标签下载资源
|
|
|
|
271
|
+ preg_match_all('/<a\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_a);
|
|
|
|
272
|
+ $down = $result_a[2] ?? [];
|
|
|
|
273
|
+ foreach ($down as $vd) {
|
|
|
|
274
|
+ $check_vd = $this->url_check($vd, $project_id, $domain, $web_url_domain, $home_url);
|
|
|
|
275
|
+ $check_vd && $source[] = $check_vd;
|
|
|
|
276
|
+ }
|
|
249
|
|
277
|
|
|
250
|
return $source;
|
278
|
return $source;
|
|
251
|
}
|
279
|
}
|
|
@@ -265,7 +293,7 @@ class HtmlLanguageCollect extends Command |
|
@@ -265,7 +293,7 @@ class HtmlLanguageCollect extends Command |
|
265
|
(empty($host) || $host == $web_url_domain || $host == $home_url)
|
293
|
(empty($host) || $host == $web_url_domain || $host == $home_url)
|
|
266
|
&& $path
|
294
|
&& $path
|
|
267
|
&& (strpos($path, '.') !== false)
|
295
|
&& (strpos($path, '.') !== false)
|
|
268
|
- && (end($path_arr) != 'html')
|
296
|
+ && (!in_array(end($path_arr), ['html', 'php', 'com', 'xml']))
|
|
269
|
) {
|
297
|
) {
|
|
270
|
$source = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
|
298
|
$source = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
|
|
271
|
if (!$source) {
|
299
|
if (!$source) {
|
|
@@ -329,7 +357,7 @@ class HtmlLanguageCollect extends Command |
|
@@ -329,7 +357,7 @@ class HtmlLanguageCollect extends Command |
|
329
|
continue;
|
357
|
continue;
|
|
330
|
}
|
358
|
}
|
|
331
|
$path_arr = explode('.', $vcs);
|
359
|
$path_arr = explode('.', $vcs);
|
|
332
|
- if(end($path_arr) == 'html'){
|
360
|
+ if(in_array(end($path_arr), ['html', 'php', 'com', 'xml'])){
|
|
333
|
continue;
|
361
|
continue;
|
|
334
|
}
|
362
|
}
|
|
335
|
|
363
|
|