|
...
|
...
|
@@ -39,14 +39,15 @@ class HtmlCollect extends Command |
|
|
|
|
|
|
|
/**
 * Presumably the console command's entry point (the enclosing class extends Command).
 *
 * As written, the uncommented lines call $this->start_collect() inside a
 * `while (true)` loop, so this method never returns on its own.
 *
 * NOTE(review): this span reads like an unmarked diff render — it contains stray
 * `|` separator lines plus both an active and a commented-out `while (true) {` / `}`
 * pair. It is unclear which revision is current; if the commented-out pair is the
 * newer one, start_collect() would run once per invocation instead of looping.
 * Confirm against the real file before relying on either reading.
 */
public function handle()
|
|
|
|
{
|
|
|
|
while (true) {
|
|
|
|
// while (true) {
|
|
|
|
$this->start_collect();
|
|
|
|
}
|
|
|
|
// }
|
|
|
|
}
|
|
|
|
|
|
|
|
protected function start_collect()
|
|
|
|
{
|
|
|
|
$task_id = $this->get_task();
|
|
|
|
// $task_id = $this->get_task();
|
|
|
|
$task_id = '437_5995';
|
|
|
|
if ($task_id === false) {
|
|
|
|
//所有项目采集完成
|
|
|
|
sleep(60);
|
|
...
|
...
|
@@ -336,17 +337,21 @@ class HtmlCollect extends Command |
|
|
|
]);
|
|
|
|
$html = str_replace($vs['url'], getImageUrl($new_source), $html);
|
|
|
|
|
|
|
|
if (substr($new_source, -3, 3) == 'css') {
|
|
|
|
// 下载css文件中的资源
|
|
|
|
$css_html = curl_c($vs['url_complete'], false);
|
|
|
|
preg_match_all("/url\(['\"](\s*[^>]+?)['\"]\)/i", $css_html, $result_css_source);
|
|
|
|
$css_source = $result_css_source[1] ?? [];
|
|
|
|
if (substr($new_source, -3, 3) == 'css' || substr($new_source, -2, 2) == 'js') {
|
|
|
|
$source_html = curl_c($vs['url_complete'], false);
|
|
|
|
if (substr($new_source, -3, 3) == 'css') {
|
|
|
|
preg_match_all("/url\(['\"](\s*[^>]+?)['\"]\)/i", $source_html, $result_source);
|
|
|
|
} else {
|
|
|
|
preg_match_all("/[large|thumb]+URL:['\"]+(\s*[^>]+?)['\"]+,/i", $source_html, $result_source);
|
|
|
|
}
|
|
|
|
$source_list = $result_source[1] ?? [];
|
|
|
|
|
|
|
|
$url_arr = explode('/', $vs['url_complete']);
|
|
|
|
$target_arr = explode('/', $new_source);
|
|
|
|
foreach ($css_source as $vcs) {
|
|
|
|
foreach ($source_list as $vcs) {
|
|
|
|
$vcs = str_replace('"', '', $vcs);
|
|
|
|
$vcs_arr = parse_url($vcs);
|
|
|
|
|
|
|
|
if (isset($vcs_arr['domain'])) {
|
|
|
|
//不是相对路径,不下载
|
|
|
|
continue;
|
...
|
...
|
|