作者 赵彬吉
  1 +<?php
  2 +
  3 +namespace App\Console\Commands\Update;
  4 +
  5 +use App\Models\Collect\CollectSource;
  6 +use App\Models\Collect\CollectTask;
  7 +use App\Models\Com\UpdateLog;
  8 +use App\Services\CosService;
  9 +use App\Services\ProjectServer;
  10 +use Illuminate\Console\Command;
  11 +use Illuminate\Support\Facades\DB;
  12 +use Illuminate\Support\Facades\Redis;
  13 +
/**
 * Page collection for projects upgraded from 4.0 / 5.0 to 6.0.
 *
 * Each run takes one queued collect task, fetches the page's HTML, uploads
 * the static resources it references and stores the rewritten HTML on the task.
 *
 * Class HtmlCollect
 * @package App\Console\Commands\Update
 * @author Akun
 * @date 2023/11/10 16:04
 */
class HtmlCollect extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'project_html_collect';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = '执行项目html页面采集';

    /**
     * Command entry point: performs a single collection pass.
     *
     * The endless loop that used to wrap this call is disabled, so one
     * invocation handles at most one task.
     */
    public function handle()
    {
        $this->start_collect();
    }

    /**
     * Take one task from the queue and collect its page.
     *
     * @return bool Always true.
     */
    protected function start_collect()
    {
        $task_id = $this->get_task();
        if ($task_id === false) {
            // Every project has been collected.
            sleep(60);
            return true;
        }
        if ($task_id === 0) {
            // Nothing to do for the current project right now.
            sleep(2);
            return true;
        }

        // Queue entries have the form "<project_id>_<collect_task_id>".
        $task_arr = explode('_', (string)$task_id);
        if (count($task_arr) < 2) {
            // Malformed queue entry: drop it instead of reading a missing index.
            sleep(2);
            return true;
        }
        $project_id = $task_arr[0];
        $collect_id = $task_arr[1];

        // Switch to the project's database.
        $project = ProjectServer::useProject($project_id);
        try {
            if ($project) {
                $this->collect_page($project_id, $collect_id);
            }
        } finally {
            // Always release the project connection, including on early exits.
            DB::disconnect('custom_mysql');
        }

        sleep(2);
        return true;
    }

    /**
     * Collect a single page: fetch the HTML, localise its resources, save it.
     *
     * Must be called while the project database connection is active.
     *
     * @param int|string $project_id
     * @param int|string $collect_id Primary key of the collect task.
     * @return void
     */
    private function collect_page($project_id, $collect_id)
    {
        $collect_info = CollectTask::select(['id', 'domain', 'route'])
            ->where('id', $collect_id)
            ->where('status', CollectTask::STATUS_UN)
            ->first();

        if (!$collect_info) {
            // Task is missing or no longer in the "not started" state.
            return;
        }

        $this->log_line($project_id, $collect_id, 'collect start');

        $collect_info->status = CollectTask::STATUS_ING;
        $collect_info->save();

        // Fetch the page, then download its resources and rewrite their URLs.
        try {
            $html = file_get_contents('https://' . $collect_info->domain . $collect_info->route);
            if ($html === false) {
                // Without a warning-to-exception handler a failed fetch returns
                // false; never store that as a completed page.
                throw new \RuntimeException('failed to fetch page');
            }

            $source_list = $this->html_preg($html, $project_id, $collect_info->domain);
            if ($source_list) {
                $html = $this->upload_source($html, $source_list, $project_id);
            }
        } catch (\Exception $e) {
            // NOTE(review): the task stays in STATUS_ING after a failure, so it
            // is not retried automatically - confirm this is intended.
            $this->log_line($project_id, $collect_id, 'error: ' . $e->getMessage());
            return;
        }

        $collect_info->html = $html;
        $collect_info->status = CollectTask::STATUS_COM;
        $collect_info->save();

        $this->log_line($project_id, $collect_id, 'collect end');
    }

    /**
     * Print one timestamped progress line for a task.
     *
     * @param int|string $project_id
     * @param int|string $collect_id
     * @param string $message
     * @return void
     */
    private function log_line($project_id, $collect_id, $message)
    {
        echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', task_type: ' . $collect_id . ', ' . $message . PHP_EOL;
    }

    /**
     * Fetch the next task id from the Redis queue, refilling the queue from
     * the next project awaiting collection when it is empty.
     *
     * @return string|int|false "<project_id>_<task_id>" for a task to run,
     *                          0 when the current project has nothing to run,
     *                          false when no project is waiting for collection.
     */
    protected function get_task()
    {
        $key = 'console_html_collect_task';
        $task_id = Redis::rpop($key);
        if ($task_id) {
            return $task_id;
        }

        $update_log = UpdateLog::where('status', UpdateLog::STATUS_COM)
            ->where('collect_status', UpdateLog::COLLECT_STATUS_UN)
            ->orderBy('project_id', 'asc')
            ->first();
        if (!$update_log) {
            return false;
        }

        $complete = false;
        // Switch to the project's database.
        $project = ProjectServer::useProject($update_log->project_id);
        try {
            if ($project) {
                $collect_list = CollectTask::select(['id', 'project_id'])
                    ->where('project_id', $update_log->project_id)
                    ->where('status', CollectTask::STATUS_UN)
                    ->limit(50)
                    ->get();

                if ($collect_list->count() == 0) {
                    $complete = true;
                } else {
                    foreach ($collect_list as $collect) {
                        Redis::lpush($key, $collect['project_id'] . '_' . $collect['id']);
                    }
                }
            }
        } finally {
            // Release the project connection.
            DB::disconnect('custom_mysql');
        }

        if ($complete) {
            $update_log->collect_status = UpdateLog::COLLECT_STATUS_COM;
            // Persist the flag; otherwise the same log row is selected again
            // on every call and the command never advances.
            $update_log->save();
            return 0;
        }

        // The queue can still be empty here (project unavailable, or another
        // worker drained it); report "nothing to do" rather than null/false.
        $task_id = Redis::rpop($key);
        return $task_id ?: 0;
    }

    /**
     * Extract the resource URLs referenced by the page.
     *
     * Scans <img src>, <script src>, <source src> and <link href>, in that
     * order, and keeps every URL accepted by url_check().
     *
     * @param string $html
     * @param int|string $project_id
     * @param string $domain Domain of the page being collected.
     * @return array List of descriptors as returned by url_check().
     */
    protected function html_preg($html, $project_id, $domain)
    {
        $source = [];

        if (!$html) {
            return $source;
        }

        $patterns = [
            // images
            '/<img\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i',
            // js
            '/<script\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i',
            // video
            '/<source\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i',
            // css
            '/<link\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i',
        ];

        foreach ($patterns as $pattern) {
            preg_match_all($pattern, $html, $matches);
            // Group 2 holds the attribute value between the matching quotes.
            foreach ($matches[2] ?? [] as $url) {
                $checked = $this->url_check($url, $project_id, $domain);
                if ($checked) {
                    $source[] = $checked;
                }
            }
        }

        return $source;
    }

    /**
     * Decide whether a resource URL has to be localised.
     *
     * @param string $url URL exactly as written in the HTML.
     * @param int|string $project_id
     * @param string $domain Domain used when the URL has no host.
     * @return array|false false when the URL is skipped; otherwise
     *                     ['download' => bool, 'url' => string, 'url_complete' => string],
     *                     where url_complete is the absolute URL to download, or the
     *                     stored target when the resource was collected before.
     */
    protected function url_check($url, $project_id, $domain)
    {
        if (!$url) {
            return false;
        }

        $arr = parse_url($url);
        if ($arr === false) {
            return false;
        }

        $scheme = $arr['scheme'] ?? '';
        $host = $arr['host'] ?? '';
        $path = $arr['path'] ?? '';

        // data:, javascript:, mailto: and similar URIs are not downloadable files.
        if ($scheme !== '' && !in_array(strtolower($scheme), ['http', 'https'], true)) {
            return false;
        }

        // Skip resources on the platform's own hosts and anything whose path
        // does not look like a file (no extension).
        if (strpos($host, '.globalso.') !== false
            || strpos($host, '.goodao.') !== false
            || !$path
            || strpos($path, '.') === false) {
            return false;
        }

        $source = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
        if ($source) {
            // Already collected: reuse the stored target.
            return [
                'download' => false,
                'url' => $url,
                'url_complete' => $source['target'],
            ];
        }

        // A relative path such as "images/a.png" must not be glued straight
        // onto the host. NOTE(review): it is resolved against the site root,
        // not the page's directory - confirm that suits the collected sites.
        if ($path[0] !== '/') {
            $path = '/' . $path;
        }

        return [
            'download' => true,
            'url' => $url,
            'url_complete' => ($scheme ?: 'https') . '://' . ($host ?: $domain) . $path,
        ];
    }

    /**
     * Upload the collected resources and rewrite their URLs in the HTML.
     *
     * @param string $html
     * @param array $source Descriptors produced by html_preg().
     * @param int|string $project_id
     * @return string HTML with every successfully handled URL replaced.
     */
    protected function upload_source($html, $source, $project_id)
    {
        // Original URLs already replaced in this page. A URL that occurs
        // several times must be uploaded and recorded only once.
        $handled = [];

        foreach ($source as $vs) {
            $origin = $vs['url'];
            if (isset($handled[$origin])) {
                continue;
            }

            if ($vs['download']) {
                $new_source = CosService::uploadRemote($project_id, 'source', $vs['url_complete']);
                if (!$new_source) {
                    // Upload failed: leave the original URL in place.
                    continue;
                }

                $now = date('Y-m-d H:i:s');
                CollectSource::insert([
                    'project_id' => $project_id,
                    'origin' => $origin,
                    'target' => $new_source,
                    'created_at' => $now,
                    'updated_at' => $now,
                ]);
                $target = $new_source;
            } else {
                $target = $vs['url_complete'];
            }

            $html = str_replace($origin, getImageUrl($target), $html);
            $handled[$origin] = true;
        }

        return $html;
    }
}
@@ -4,8 +4,8 @@ namespace App\Console\Commands\Update; @@ -4,8 +4,8 @@ namespace App\Console\Commands\Update;
4 4
5 use App\Helper\Arr; 5 use App\Helper\Arr;
6 use App\Http\Logic\Bside\Product\CategoryLogic; 6 use App\Http\Logic\Bside\Product\CategoryLogic;
7 -use App\Http\Logic\Bside\Product\KeywordLogic;  
8 use App\Models\Blog\Blog; 7 use App\Models\Blog\Blog;
  8 +use App\Models\Collect\CollectTask;
9 use App\Models\Com\UpdateLog; 9 use App\Models\Com\UpdateLog;
10 use App\Models\News\News; 10 use App\Models\News\News;
11 use App\Models\Product\Category; 11 use App\Models\Product\Category;
@@ -77,7 +77,7 @@ class ProjectUpdate extends Command @@ -77,7 +77,7 @@ class ProjectUpdate extends Command
77 $task->save(); 77 $task->save();
78 78
79 //设置数据库 79 //设置数据库
80 - $project = ProjectServer::useProject($task->project_id); 80 + $project = ProjectServer::useProject($project_id);
81 if ($project) { 81 if ($project) {
82 if ($api_type == 'category') { 82 if ($api_type == 'category') {
83 //分类 83 //分类
@@ -86,48 +86,6 @@ class ProjectUpdate extends Command @@ -86,48 +86,6 @@ class ProjectUpdate extends Command
86 if (isset($data['code']) && $data['code'] == 200) { 86 if (isset($data['code']) && $data['code'] == 200) {
87 $items = $data['data'] ?? []; 87 $items = $data['data'] ?? [];
88 $this->category_insert($project_id, $items, 0); 88 $this->category_insert($project_id, $items, 0);
89 -// $model = new Category();  
90 -// foreach ($items as $item) {  
91 -// $parent = $model->read(['pid' => 0, 'title' => $item['name']], 'id');  
92 -// if (!$parent) {  
93 -// try {  
94 -// $parent_id = $model->addReturnId([  
95 -// 'project_id' => $project_id,  
96 -// 'title' => $item['name'],  
97 -// 'pid' => 0,  
98 -// 'keywords' => $item['keywords'],  
99 -// 'describe' => $item['description']  
100 -// ]);  
101 -// $route = RouteMap::setRoute($item['url'] ? $this->get_url_route($item['url']) : $item['name'], RouteMap::SOURCE_PRODUCT_CATE, $parent_id, $project_id);  
102 -// $model->edit(['route' => $route], ['id' => $parent_id]);  
103 -// } catch (\Exception $e) {  
104 -// echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL;  
105 -// continue;  
106 -// }  
107 -// } else {  
108 -// $parent_id = $parent['id'];  
109 -// }  
110 -//  
111 -// foreach ($item['children'] as $child) {  
112 -// $child_info = $model->read(['pid' => $parent_id, 'title' => $child['name']]);  
113 -// if (!$child_info) {  
114 -// try {  
115 -// $child_id = $model->addReturnId([  
116 -// 'project_id' => $project_id,  
117 -// 'title' => $child['name'],  
118 -// 'pid' => $parent_id,  
119 -// 'keywords' => $child['keywords'],  
120 -// 'describe' => $child['description']  
121 -// ]);  
122 -// $route = RouteMap::setRoute($child['url'] ? $this->get_url_route($child['url']) : $child['name'], RouteMap::SOURCE_PRODUCT_CATE, $child_id, $project_id);  
123 -// $model->edit(['route' => $route], ['id' => $child_id]);  
124 -// } catch (\Exception $e) {  
125 -// echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL;  
126 -// continue;  
127 -// }  
128 -// }  
129 -// }  
130 -// }  
131 } else { 89 } else {
132 return true; 90 return true;
133 } 91 }
@@ -164,7 +122,11 @@ class ProjectUpdate extends Command @@ -164,7 +122,11 @@ class ProjectUpdate extends Command
164 echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL; 122 echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL;
165 continue; 123 continue;
166 } 124 }
  125 + } else {
  126 + $id = $keyword['id'];
167 } 127 }
  128 +
  129 + CollectTask::_insert($item['url'], $project_id, RouteMap::SOURCE_PRODUCT_KEYWORD, $id);
168 } 130 }
169 } 131 }
170 } 132 }
@@ -279,7 +241,11 @@ class ProjectUpdate extends Command @@ -279,7 +241,11 @@ class ProjectUpdate extends Command
279 echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL; 241 echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL;
280 continue; 242 continue;
281 } 243 }
  244 + } else {
  245 + $id = $product['id'];
282 } 246 }
  247 +
  248 + CollectTask::_insert($item['url'], $project_id, RouteMap::SOURCE_PRODUCT, $id);
283 } 249 }
284 } 250 }
285 } 251 }
@@ -329,7 +295,11 @@ class ProjectUpdate extends Command @@ -329,7 +295,11 @@ class ProjectUpdate extends Command
329 echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL; 295 echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL;
330 continue; 296 continue;
331 } 297 }
  298 + } else {
  299 + $id = $news['id'];
332 } 300 }
  301 +
  302 + CollectTask::_insert($item['url'], $project_id, $api_type == 'news' ? RouteMap::SOURCE_NEWS : RouteMap::SOURCE_BLOG, $id);
333 } 303 }
334 } 304 }
335 } 305 }
@@ -373,7 +343,11 @@ class ProjectUpdate extends Command @@ -373,7 +343,11 @@ class ProjectUpdate extends Command
373 echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL; 343 echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL;
374 continue; 344 continue;
375 } 345 }
  346 + } else {
  347 + $id = $custom['id'];
376 } 348 }
  349 +
  350 + CollectTask::_insert($item['url'], $project_id, RouteMap::SOURCE_PAGE, $id);
377 } 351 }
378 } 352 }
379 } 353 }
@@ -423,6 +397,7 @@ class ProjectUpdate extends Command @@ -423,6 +397,7 @@ class ProjectUpdate extends Command
423 return $arr[count($arr) - 2]; 397 return $arr[count($arr) - 2];
424 } 398 }
425 399
  400 + //多级分类入库
426 protected function category_insert($project_id, $items, $pid = 0) 401 protected function category_insert($project_id, $items, $pid = 0)
427 { 402 {
428 $model = new Category(); 403 $model = new Category();
@@ -69,8 +69,6 @@ class CustomTemplateLogic extends BaseLogic @@ -69,8 +69,6 @@ class CustomTemplateLogic extends BaseLogic
69 }else{ 69 }else{
70 if($this->param['url'] == $this->model::NOT_FOUND_PAGE_URL){ 70 if($this->param['url'] == $this->model::NOT_FOUND_PAGE_URL){
71 $this->fail('404页面已存在'); 71 $this->fail('404页面已存在');
72 - }else{  
73 - $this->param['url'] = $this->param['url'].'-tag';  
74 } 72 }
75 $this->param['project_id'] = $this->user['project_id']; 73 $this->param['project_id'] = $this->user['project_id'];
76 $id = $this->model->addReturnId($this->param); 74 $id = $this->model->addReturnId($this->param);
  1 +<?php
  2 +
  3 +namespace App\Models\Collect;
  4 +
  5 +use App\Models\Base;
  6 +
/**
 * Eloquent model for the gl_collect_source table on the custom_mysql connection.
 */
class CollectSource extends Base
{
    // Database connection used by this model.
    protected $connection = 'custom_mysql';

    // Table backing this model.
    protected $table = 'gl_collect_source';
}
  1 +<?php
  2 +
  3 +namespace App\Models\Collect;
  4 +
  5 +use App\Models\Base;
  6 +
/**
 * Eloquent model for the gl_collect_task table on the custom_mysql connection.
 * Each row describes one page (domain + route) to be collected.
 */
class CollectTask extends Base
{
    // Table backing this model.
    protected $table = 'gl_collect_task';

    // Database connection used by this model.
    protected $connection = 'custom_mysql';

    // Task status values stored in the `status` column.
    const STATUS_UN = 0;
    const STATUS_ING = 1;
    const STATUS_COM = 2;

    /**
     * Register a collect task for a page URL unless an identical one exists.
     *
     * Nothing is inserted when the URL is empty, cannot be parsed, or has no
     * host, because a task without a domain cannot be fetched.
     *
     * @param string $url Absolute URL of the page to collect.
     * @param int|string $project_id
     * @param mixed $source Source type of the record the page belongs to.
     * @param int|string $source_id Id of that record.
     * @return void
     */
    public static function _insert($url, $project_id, $source, $source_id)
    {
        if (!$url) {
            return;
        }

        $url_arr = parse_url($url);
        // parse_url() returns false for seriously malformed URLs and omits the
        // "host" key for relative ones; neither yields a usable task.
        if (!is_array($url_arr) || empty($url_arr['host'])) {
            return;
        }

        $data = [
            'project_id' => $project_id,
            'source' => $source,
            'source_id' => $source_id,
            'domain' => $url_arr['host'],
            // "https://example.com" has no path component; treat it as the root.
            'route' => $url_arr['path'] ?? '/',
        ];

        $task = self::where($data)->first();
        if (!$task) {
            $data['created_at'] = $data['updated_at'] = date('Y-m-d H:i:s');
            self::insert($data);
        }
    }
}
@@ -13,6 +13,9 @@ class UpdateLog extends Model @@ -13,6 +13,9 @@ class UpdateLog extends Model
13 const STATUS_ING = 1;//导入中 13 const STATUS_ING = 1;//导入中
14 const STATUS_COM = 2;//导入完成 14 const STATUS_COM = 2;//导入完成
15 15
  16 + const COLLECT_STATUS_UN = 0;//未开始
  17 + const COLLECT_STATUS_COM = 1;//采集完成
  18 +
16 /** 19 /**
17 * 创建更新日志 20 * 创建更新日志
18 * @param $project_id 21 * @param $project_id
@@ -30,6 +33,7 @@ class UpdateLog extends Model @@ -30,6 +33,7 @@ class UpdateLog extends Model
30 $log->api_type = $type; 33 $log->api_type = $type;
31 $log->api_url = $url; 34 $log->api_url = $url;
32 $log->sort = $type == 'category' ? 0 :1; 35 $log->sort = $type == 'category' ? 0 :1;
  36 + $log->collect_status = in_array($type, ['website_info', 'category']) ? 1 : 0;
33 return $log->save(); 37 return $log->save();
34 } 38 }
35 return true; 39 return true;