作者 刘锟

html采集

... ... @@ -49,14 +49,15 @@ class HtmlCollect extends Command
/**
 * Runs start_update() inside a `while (true)` loop; this method contains no
 * break or other exit condition.
 *
 * NOTE(review): the commented-out `while (true) {` / `}` lines next to the
 * active ones suggest the loop was being toggled off (e.g. for a one-shot
 * debugging run). Confirm which variant is intended before relying on this
 * method to loop or to return.
 */
public function handle()
{
while (true) {
// while (true) {
$this->start_update();
}
// }
}
protected function start_update()
{
$task_id = $this->get_task();
// $task_id = $this->get_task();
$task_id = '298_1';
if ($task_id === false) {
//所有项目采集完成
sleep(60);
... ... @@ -204,7 +205,6 @@ class HtmlCollect extends Command
$scheme = $arr['scheme'] ?? '';
$host = $arr['host'] ?? '';
$path = $arr['path'] ?? '';
$query = $arr['query'] ?? '';
if ((strpos($host, '.globalso.') === false)
&& (strpos($host, '.goodao.') === false)
... ... @@ -214,7 +214,7 @@ class HtmlCollect extends Command
if (!$source) {
return [
'url' => $url,
'url_complete' => ($scheme ?: 'https') . '://' . ($host ?: $domain) . $path . ($query ? '?' . $query : '')
'url_complete' => ($scheme ?: 'https') . '://' . ($host ?: $domain) . $path
];
} else {
return false;
... ... @@ -236,12 +236,12 @@ class HtmlCollect extends Command
if ($new_source) {
CollectSource::insert([
'project_id' => $project_id,
'origin' => $vs['ur'],
'origin' => $vs['url'],
'target' => $new_source,
'created_at' => date('Y-m-d H:i:s'),
'updated_at' => date('Y-m-d H:i:s'),
]);
$html = str_replace($vs['ur'], $new_source, $html);
$html = str_replace($vs['url'], $new_source, $html);
}
}
... ...