|
@@ -9,7 +9,9 @@ use App\Models\RouteMap\RouteMap; |
|
@@ -9,7 +9,9 @@ use App\Models\RouteMap\RouteMap; |
|
9
|
use App\Services\CosService;
|
9
|
use App\Services\CosService;
|
|
10
|
use App\Services\ProjectServer;
|
10
|
use App\Services\ProjectServer;
|
|
11
|
use Illuminate\Console\Command;
|
11
|
use Illuminate\Console\Command;
|
|
|
|
12
|
+use Illuminate\Support\Facades\Cache;
|
|
12
|
use Illuminate\Support\Facades\DB;
|
13
|
use Illuminate\Support\Facades\DB;
|
|
|
|
14
|
+use Illuminate\Support\Facades\Log;
|
|
13
|
use Illuminate\Support\Facades\Redis;
|
15
|
use Illuminate\Support\Facades\Redis;
|
|
14
|
|
16
|
|
|
15
|
/**
|
17
|
/**
|
|
@@ -39,7 +41,7 @@ class HtmlCollect extends Command |
|
@@ -39,7 +41,7 @@ class HtmlCollect extends Command |
|
39
|
/**
 * Console command entry point.
 *
 * Invokes start_collect() again and again. The loop has no exit condition of
 * its own, so it only ends when the process is stopped or an error propagates
 * out of start_collect().
 */
public function handle()
{
    for (;;) {
        $this->start_collect();
    }
}
|
|
45
|
|
47
|
|
|
@@ -76,34 +78,28 @@ class HtmlCollect extends Command |
|
@@ -76,34 +78,28 @@ class HtmlCollect extends Command |
|
76
|
$collect_info->save();
|
78
|
$collect_info->save();
|
|
77
|
|
79
|
|
|
78
|
//获取站点正式和测试域名
|
80
|
//获取站点正式和测试域名
|
|
79
|
- $web_url_domain = $collect_info->domain;
|
|
|
|
80
|
- $home_url = $collect_info->domain;
|
|
|
|
81
|
- $url_web_config = 'https://' . $collect_info->domain . '/wp-content/cache/user_config.text';
|
|
|
|
82
|
- $data_config = curl_c($url_web_config);
|
|
|
|
83
|
- if ($data_config) {
|
|
|
|
84
|
- $web_url_arr = parse_url($data_config['web_url_domain'] ?? '');
|
|
|
|
85
|
- if (isset($web_url_arr['host'])) {
|
|
|
|
86
|
- $web_url_domain = $web_url_arr['host'];
|
|
|
|
87
|
- }
|
|
|
|
88
|
-
|
|
|
|
89
|
- $home_url_arr = parse_url($data_config['home_url'] ?? '');
|
|
|
|
90
|
- if (isset($home_url_arr['host'])) {
|
|
|
|
91
|
- $home_url = $home_url_arr['host'];
|
|
|
|
92
|
- }
|
|
|
|
93
|
- }
|
81
|
+ $old_info = $this->getOldDomain($project_id, $collect_info->domain);
|
|
94
|
|
82
|
|
|
95
|
//采集html页面,下载资源到本地并替换
|
83
|
//采集html页面,下载资源到本地并替换
|
|
96
|
try {
|
84
|
try {
|
|
97
|
$html = curl_c('https://' . $collect_info->domain . $collect_info->route, false);
|
85
|
$html = curl_c('https://' . $collect_info->domain . $collect_info->route, false);
|
|
98
|
- if($html == '0'){
|
86
|
+ if ($html == '0') {
|
|
99
|
$collect_info->status = CollectTask::STATUS_FAIL;
|
87
|
$collect_info->status = CollectTask::STATUS_FAIL;
|
|
100
|
$collect_info->save();
|
88
|
$collect_info->save();
|
|
101
|
- echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', error: no html' . PHP_EOL;
|
89
|
+ echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', collect_id: ' . $collect_id . ', error: no html' . PHP_EOL;
|
|
102
|
sleep(2);
|
90
|
sleep(2);
|
|
103
|
return true;
|
91
|
return true;
|
|
104
|
}
|
92
|
}
|
|
105
|
|
93
|
|
|
106
|
- $source_list = $this->html_preg($html, $project_id, $collect_info->domain, $web_url_domain, $home_url);
|
94
|
+ //如果有base64图片,先替换掉,再进行资源匹配
|
|
|
|
95
|
+ $new_html = $html;
|
|
|
|
96
|
+ preg_match_all("/data:([^;]*);base64,(.*)?\"/", $new_html, $result_img);
|
|
|
|
97
|
+ $img_base64 = $result_img[2] ?? [];
|
|
|
|
98
|
+ foreach ($img_base64 as $v64) {
|
|
|
|
99
|
+ $new_html = str_replace($v64, '', $new_html);
|
|
|
|
100
|
+ }
|
|
|
|
101
|
+
|
|
|
|
102
|
+ $source_list = $this->html_preg($new_html, $project_id, $collect_info->domain, $old_info['web_url_domain'], $old_info['home_url']);
|
|
107
|
|
103
|
|
|
108
|
if ($source_list) {
|
104
|
if ($source_list) {
|
|
109
|
$html = $this->upload_source($html, $source_list, $project_id);
|
105
|
$html = $this->upload_source($html, $source_list, $project_id);
|
|
@@ -187,6 +183,42 @@ class HtmlCollect extends Command |
|
@@ -187,6 +183,42 @@ class HtmlCollect extends Command |
|
187
|
return $task_id;
|
183
|
return $task_id;
|
|
188
|
}
|
184
|
}
|
|
189
|
|
185
|
|
|
|
|
186
|
/**
 * Resolve the legacy ("old") hostnames of a project's site.
 *
 * The cache is consulted first. On a miss the method requests
 * https://{domain}/wp-content/cache/user_config.text through curl_c() and
 * takes the host component of its `web_url_domain` and `home_url` entries.
 * Each value falls back to $domain when its entry is missing, is not a string,
 * or has no host component.
 *
 * A result built from a loaded config is cached for one hour. When the config
 * could not be loaded, the pure fallback is cached only briefly, so a transient
 * fetch failure does not pin the wrong hostnames for a whole hour.
 *
 * NOTE(review): the cache key is built from $project_id only. Confirm that a
 * project never collects from more than one domain; otherwise the fallback of
 * the first domain seen is reused for the others.
 * NOTE(review): the TTLs are written as seconds, matching the original
 * "cache for 1 hour" comment. Confirm the framework version interprets
 * integer TTLs as seconds.
 *
 * @param mixed  $project_id Project identifier, concatenated into the cache key.
 * @param string $domain     Domain being collected; also the fallback value.
 *
 * @return array Array with the keys `web_url_domain` and `home_url`.
 */
protected function getOldDomain($project_id, $domain)
{
    $key = 'project_collect_domain_' . $project_id;

    $cached = Cache::get($key);
    if ($cached) {
        return $cached;
    }

    // Start from the fallback: both names default to the domain being collected.
    $data = [
        'web_url_domain' => $domain,
        'home_url' => $domain,
    ];

    // Assumes curl_c() returns the decoded config as an array on success — TODO confirm.
    $data_config = curl_c('https://' . $domain . '/wp-content/cache/user_config.text');
    $config_loaded = is_array($data_config) && !empty($data_config);

    if ($config_loaded) {
        foreach (array_keys($data) as $field) {
            $value = $data_config[$field] ?? '';
            if (!is_string($value)) {
                // parse_url() only accepts strings; keep the fallback for this field.
                continue;
            }

            $host = parse_url($value, PHP_URL_HOST);
            if (is_string($host) && $host !== '') {
                $data[$field] = $host;
            }
        }
    }

    // One hour for a real config, five minutes for the fallback after a failed fetch.
    $ttl = $config_loaded ? 3600 : 300;
    Cache::add($key, $data, $ttl);

    return $data;
}
|
|
|
|
221
|
+
|
|
190
|
//正则匹配html资源
|
222
|
//正则匹配html资源
|
|
191
|
protected function html_preg($html, $project_id, $domain, $web_url_domain, $home_url)
|
223
|
protected function html_preg($html, $project_id, $domain, $web_url_domain, $home_url)
|
|
192
|
{
|
224
|
{
|
|
@@ -236,7 +268,6 @@ class HtmlCollect extends Command |
|
@@ -236,7 +268,6 @@ class HtmlCollect extends Command |
|
236
|
$check_vc_b && $source[] = $check_vc_b;
|
268
|
$check_vc_b && $source[] = $check_vc_b;
|
|
237
|
}
|
269
|
}
|
|
238
|
|
270
|
|
|
239
|
-
|
|
|
|
240
|
return $source;
|
271
|
return $source;
|
|
241
|
}
|
272
|
}
|
|
242
|
|
273
|
|
|
@@ -319,7 +350,7 @@ class HtmlCollect extends Command |
|
@@ -319,7 +350,7 @@ class HtmlCollect extends Command |
|
319
|
continue;
|
350
|
continue;
|
|
320
|
}
|
351
|
}
|
|
321
|
$path_arr = explode('.', $vcs);
|
352
|
$path_arr = explode('.', $vcs);
|
|
322
|
- if(end($path_arr) == 'html'){
|
353
|
+ if (end($path_arr) == 'html') {
|
|
323
|
continue;
|
354
|
continue;
|
|
324
|
}
|
355
|
}
|
|
325
|
|
356
|
|