作者 刘锟

html采集

@@ -49,14 +49,15 @@ class HtmlCollect extends Command @@ -49,14 +49,15 @@ class HtmlCollect extends Command
49 49
50 public function handle() 50 public function handle()
51 { 51 {
52 - while (true) {  
53 - $this->start_update();  
54 - } 52 +// while (true) {
  53 + $this->start_update();
  54 +// }
55 } 55 }
56 56
57 protected function start_update() 57 protected function start_update()
58 { 58 {
59 - $task_id = $this->get_task(); 59 +// $task_id = $this->get_task();
  60 + $task_id = '298_1';
60 if ($task_id === false) { 61 if ($task_id === false) {
61 //所有项目采集完成 62 //所有项目采集完成
62 sleep(60); 63 sleep(60);
@@ -204,7 +205,6 @@ class HtmlCollect extends Command @@ -204,7 +205,6 @@ class HtmlCollect extends Command
204 $scheme = $arr['scheme'] ?? ''; 205 $scheme = $arr['scheme'] ?? '';
205 $host = $arr['host'] ?? ''; 206 $host = $arr['host'] ?? '';
206 $path = $arr['path'] ?? ''; 207 $path = $arr['path'] ?? '';
207 - $query = $arr['query'] ?? '';  
208 208
209 if ((strpos($host, '.globalso.') === false) 209 if ((strpos($host, '.globalso.') === false)
210 && (strpos($host, '.goodao.') === false) 210 && (strpos($host, '.goodao.') === false)
@@ -214,7 +214,7 @@ class HtmlCollect extends Command @@ -214,7 +214,7 @@ class HtmlCollect extends Command
214 if (!$source) { 214 if (!$source) {
215 return [ 215 return [
216 'url' => $url, 216 'url' => $url,
217 - 'url_complete' => ($scheme ?: 'https') . '://' . ($host ?: $domain) . $path . ($query ? '?' . $query : '') 217 + 'url_complete' => ($scheme ?: 'https') . '://' . ($host ?: $domain) . $path
218 ]; 218 ];
219 } else { 219 } else {
220 return false; 220 return false;
@@ -236,12 +236,12 @@ class HtmlCollect extends Command @@ -236,12 +236,12 @@ class HtmlCollect extends Command
236 if ($new_source) { 236 if ($new_source) {
237 CollectSource::insert([ 237 CollectSource::insert([
238 'project_id' => $project_id, 238 'project_id' => $project_id,
239 - 'origin' => $vs['ur'], 239 + 'origin' => $vs['url'],
240 'target' => $new_source, 240 'target' => $new_source,
241 'created_at' => date('Y-m-d H:i:s'), 241 'created_at' => date('Y-m-d H:i:s'),
242 'updated_at' => date('Y-m-d H:i:s'), 242 'updated_at' => date('Y-m-d H:i:s'),
243 ]); 243 ]);
244 - $html = str_replace($vs['ur'], $new_source, $html); 244 + $html = str_replace($vs['url'], $new_source, $html);
245 } 245 }
246 } 246 }
247 247