作者 刘锟

合并分支 'akun' 到 'master'

Akun



查看合并请求 !34
  1 +<?php
  2 +
  3 +namespace App\Console\Commands\Update;
  4 +
  5 +use App\Helper\Arr;
  6 +use App\Http\Logic\Bside\Product\CategoryLogic;
  7 +use App\Http\Logic\Bside\Product\KeywordLogic;
  8 +use App\Models\Blog\Blog;
  9 +use App\Models\Collect\CollectSource;
  10 +use App\Models\Collect\CollectTask;
  11 +use App\Models\Com\UpdateLog;
  12 +use App\Models\News\News;
  13 +use App\Models\Product\Category;
  14 +use App\Models\Product\Keyword;
  15 +use App\Models\Product\Product;
  16 +use App\Models\RouteMap\RouteMap;
  17 +use App\Models\Template\BCustomTemplate;
  18 +use App\Models\WebSetting\WebSettingReceiving;
  19 +use App\Services\CosService;
  20 +use App\Services\ProjectServer;
  21 +use App\Utils\HttpUtils;
  22 +use Illuminate\Console\Command;
  23 +use Illuminate\Support\Facades\DB;
  24 +use Illuminate\Support\Facades\Redis;
  25 +
  26 +/**
  27 + * 4.0,5.0升级到6.0,页面采集
  28 + * Class ProjectImport
  29 + * @package App\Console\Commands
  30 + * @author Akun
  31 + * @date 2023/11/10 16:04
  32 + */
  33 +class HtmlCollect extends Command
  34 +{
  35 + /**
  36 + * The name and signature of the console command.
  37 + *
  38 + * @var string
  39 + */
  40 + protected $signature = 'project_html_collect';
  41 +
  42 + /**
  43 + * The console command description.
  44 + *
  45 + * @var string
  46 + */
  47 + protected $description = '执行项目html页面采集';
  48 +
  49 +
  50 + public function handle()
  51 + {
  52 + while (true) {
  53 + $this->start_update();
  54 + }
  55 + }
  56 +
  57 + protected function start_update()
  58 + {
  59 + $task_id = $this->get_task();
  60 + if ($task_id === false) {
  61 + //所有项目采集完成
  62 + sleep(60);
  63 + return true;
  64 + } elseif ($task_id === 0) {
  65 + //当前项目采集完成
  66 + sleep(2);
  67 + return true;
  68 + }
  69 +
  70 + $task_arr = explode('_', $task_id);
  71 + $project_id = $task_arr[0];
  72 + $collect_id = $task_arr[1];
  73 +
  74 + //设置数据库
  75 + $project = ProjectServer::useProject($project_id);
  76 + if ($project) {
  77 + $collect_info = CollectTask::select(['id', 'domain', 'route'])->where('id', $collect_id)->where('status', CollectTask::STATUS_UN)->first();
  78 +
  79 + if (!$collect_info) {
  80 + sleep(2);
  81 + return true;
  82 + }
  83 +
  84 + echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', task_type: ' . $collect_id . ', collect start' . PHP_EOL;
  85 +
  86 + $collect_info->status = CollectTask::STATUS_ING;
  87 + $collect_info->save();
  88 +
  89 + //采集html页面,下载资源到本地并替换
  90 + try {
  91 + $html = file_get_contents('https://' . $collect_info->domain . $collect_info->route);
  92 + $source_list = $this->html_preg($html, $project_id, $collect_info->domain);
  93 +
  94 + if ($source_list) {
  95 + $html = $this->upload_source($html, $source_list, $project_id);
  96 + }
  97 + } catch (\Exception $e) {
  98 + echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', task_type: ' . $collect_id . ', error: ' . $e->getMessage() . PHP_EOL;
  99 + return true;
  100 + }
  101 +
  102 + $collect_info->html = $html;
  103 + $collect_info->status = CollectTask::STATUS_COM;
  104 + $collect_info->save();
  105 +
  106 + echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', task_type: ' . $collect_id . ', collect end' . PHP_EOL;
  107 + }
  108 + //关闭数据库
  109 + DB::disconnect('custom_mysql');
  110 +
  111 + sleep(2);
  112 + }
  113 +
  114 + //获取任务
  115 + protected function get_task()
  116 + {
  117 + $key = 'console_html_collect_task';
  118 + $task_id = Redis::rpop($key);
  119 + if ($task_id) {
  120 + return $task_id;
  121 + }
  122 +
  123 +
  124 + $update_log = UpdateLog::where('status', UpdateLog::STATUS_COM)->where('collect_status', UpdateLog::COLLECT_STATUS_UN)->orderBy('project_id', 'asc')->first();
  125 + if (!$update_log) {
  126 + return false;
  127 + }
  128 +
  129 + $complete = false;
  130 + //设置数据库
  131 + $project = ProjectServer::useProject($update_log->project_id);
  132 + if ($project) {
  133 + $collect_list = CollectTask::select(['id', 'project_id'])->where('project_id', $update_log['project_id'])->where('status', CollectTask::STATUS_UN)->limit(50)->get();
  134 +
  135 + if ($collect_list->count() == 0) {
  136 + $complete = true;
  137 + } else {
  138 + foreach ($collect_list as $collect) {
  139 + Redis::lpush($key, $collect['project_id'] . '_' . $collect['id']);
  140 + }
  141 + }
  142 + }
  143 + //关闭数据库
  144 + DB::disconnect('custom_mysql');
  145 +
  146 + if ($complete) {
  147 + $update_log->collect_status = UpdateLog::COLLECT_STATUS_COM;
  148 + return 0;
  149 + }
  150 +
  151 + $task_id = Redis::rpop($key);
  152 + return $task_id;
  153 + }
  154 +
  155 + //正则匹配html资源
  156 + protected function html_preg($html, $project_id, $domain)
  157 + {
  158 + $source = [];
  159 +
  160 + if (!$html) {
  161 + return $source;
  162 + }
  163 +
  164 + //图片
  165 + preg_match_all('/<img\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_img);
  166 + $img = $result_img[2] ?? [];
  167 + foreach ($img as $vi) {
  168 + $check_vi = $this->url_check($vi, $project_id, $domain);
  169 + $check_vi && $source[] = $check_vi;
  170 + }
  171 +
  172 + //js
  173 + preg_match_all('/<script\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_js);
  174 + $js = $result_js[2] ?? [];
  175 + foreach ($js as $vj) {
  176 + $check_vj = $this->url_check($vj, $project_id, $domain);
  177 + $check_vj && $source[] = $check_vj;
  178 + }
  179 +
  180 + //video
  181 + preg_match_all('/<source\s+[^>]*?src\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_video);
  182 + $video = $result_video[2] ?? [];
  183 + foreach ($video as $vv) {
  184 + $check_vv = $this->url_check($vv, $project_id, $domain);
  185 + $check_vv && $source[] = $check_vv;
  186 + }
  187 +
  188 + //css
  189 + preg_match_all('/<link\s+[^>]*?href\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i', $html, $result_css);
  190 + $css = $result_css[2] ?? [];
  191 + foreach ($css as $vc) {
  192 + $check_vc = $this->url_check($vc, $project_id, $domain);
  193 + $check_vc && $source[] = $check_vc;
  194 + }
  195 +
  196 + return $source;
  197 + }
  198 +
  199 + //判断资源是否需要下载
  200 + protected function url_check($url, $project_id, $domain)
  201 + {
  202 + if ($url) {
  203 + $arr = parse_url($url);
  204 + $scheme = $arr['scheme'] ?? '';
  205 + $host = $arr['host'] ?? '';
  206 + $path = $arr['path'] ?? '';
  207 + $query = $arr['query'] ?? '';
  208 +
  209 + if ((strpos($host, '.globalso.') === false)
  210 + && (strpos($host, '.goodao.') === false)
  211 + && $path && (strpos($path, '.') !== false)) {
  212 +
  213 + $source = CollectSource::where('project_id', $project_id)->where('origin', $url)->first();
  214 + if (!$source) {
  215 + return [
  216 + 'url' => $url,
  217 + 'url_complete' => ($scheme ?: 'https') . '://' . ($host ?: $domain) . $path . ($query ? '?' . $query : '')
  218 + ];
  219 + } else {
  220 + return false;
  221 + }
  222 + } else {
  223 + return false;
  224 + }
  225 + } else {
  226 + return false;
  227 + }
  228 + }
  229 +
  230 + //下载并替换资源
  231 + protected function upload_source($html, $source, $project_id)
  232 + {
  233 + foreach ($source as $vs) {
  234 +
  235 + $new_source = CosService::uploadRemote($project_id, 'source', $vs['url_complete']);
  236 + if ($new_source) {
  237 + CollectSource::insert([
  238 + 'project_id' => $project_id,
  239 + 'origin' => $vs['ur'],
  240 + 'target' => $new_source,
  241 + 'created_at' => date('Y-m-d H:i:s'),
  242 + 'updated_at' => date('Y-m-d H:i:s'),
  243 + ]);
  244 + $html = str_replace($vs['ur'], $new_source, $html);
  245 + }
  246 + }
  247 +
  248 + return $html;
  249 + }
  250 +}
@@ -6,6 +6,7 @@ use App\Helper\Arr; @@ -6,6 +6,7 @@ use App\Helper\Arr;
6 use App\Http\Logic\Bside\Product\CategoryLogic; 6 use App\Http\Logic\Bside\Product\CategoryLogic;
7 use App\Http\Logic\Bside\Product\KeywordLogic; 7 use App\Http\Logic\Bside\Product\KeywordLogic;
8 use App\Models\Blog\Blog; 8 use App\Models\Blog\Blog;
  9 +use App\Models\Collect\CollectTask;
9 use App\Models\Com\UpdateLog; 10 use App\Models\Com\UpdateLog;
10 use App\Models\News\News; 11 use App\Models\News\News;
11 use App\Models\Product\Category; 12 use App\Models\Product\Category;
@@ -77,7 +78,7 @@ class ProjectUpdate extends Command @@ -77,7 +78,7 @@ class ProjectUpdate extends Command
77 $task->save(); 78 $task->save();
78 79
79 //设置数据库 80 //设置数据库
80 - $project = ProjectServer::useProject($task->project_id); 81 + $project = ProjectServer::useProject($project_id);
81 if ($project) { 82 if ($project) {
82 if ($api_type == 'category') { 83 if ($api_type == 'category') {
83 //分类 84 //分类
@@ -86,48 +87,6 @@ class ProjectUpdate extends Command @@ -86,48 +87,6 @@ class ProjectUpdate extends Command
86 if (isset($data['code']) && $data['code'] == 200) { 87 if (isset($data['code']) && $data['code'] == 200) {
87 $items = $data['data'] ?? []; 88 $items = $data['data'] ?? [];
88 $this->category_insert($project_id, $items, 0); 89 $this->category_insert($project_id, $items, 0);
89 -// $model = new Category();  
90 -// foreach ($items as $item) {  
91 -// $parent = $model->read(['pid' => 0, 'title' => $item['name']], 'id');  
92 -// if (!$parent) {  
93 -// try {  
94 -// $parent_id = $model->addReturnId([  
95 -// 'project_id' => $project_id,  
96 -// 'title' => $item['name'],  
97 -// 'pid' => 0,  
98 -// 'keywords' => $item['keywords'],  
99 -// 'describe' => $item['description']  
100 -// ]);  
101 -// $route = RouteMap::setRoute($item['url'] ? $this->get_url_route($item['url']) : $item['name'], RouteMap::SOURCE_PRODUCT_CATE, $parent_id, $project_id);  
102 -// $model->edit(['route' => $route], ['id' => $parent_id]);  
103 -// } catch (\Exception $e) {  
104 -// echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL;  
105 -// continue;  
106 -// }  
107 -// } else {  
108 -// $parent_id = $parent['id'];  
109 -// }  
110 -//  
111 -// foreach ($item['children'] as $child) {  
112 -// $child_info = $model->read(['pid' => $parent_id, 'title' => $child['name']]);  
113 -// if (!$child_info) {  
114 -// try {  
115 -// $child_id = $model->addReturnId([  
116 -// 'project_id' => $project_id,  
117 -// 'title' => $child['name'],  
118 -// 'pid' => $parent_id,  
119 -// 'keywords' => $child['keywords'],  
120 -// 'describe' => $child['description']  
121 -// ]);  
122 -// $route = RouteMap::setRoute($child['url'] ? $this->get_url_route($child['url']) : $child['name'], RouteMap::SOURCE_PRODUCT_CATE, $child_id, $project_id);  
123 -// $model->edit(['route' => $route], ['id' => $child_id]);  
124 -// } catch (\Exception $e) {  
125 -// echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL;  
126 -// continue;  
127 -// }  
128 -// }  
129 -// }  
130 -// }  
131 } else { 90 } else {
132 return true; 91 return true;
133 } 92 }
@@ -164,7 +123,11 @@ class ProjectUpdate extends Command @@ -164,7 +123,11 @@ class ProjectUpdate extends Command
164 echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL; 123 echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL;
165 continue; 124 continue;
166 } 125 }
  126 + } else {
  127 + $id = $keyword['id'];
167 } 128 }
  129 +
  130 + CollectTask::_insert($item['url'], $project_id, RouteMap::SOURCE_PRODUCT_KEYWORD, $id);
168 } 131 }
169 } 132 }
170 } 133 }
@@ -279,7 +242,11 @@ class ProjectUpdate extends Command @@ -279,7 +242,11 @@ class ProjectUpdate extends Command
279 echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL; 242 echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL;
280 continue; 243 continue;
281 } 244 }
  245 + } else {
  246 + $id = $product['id'];
282 } 247 }
  248 +
  249 + CollectTask::_insert($item['url'], $project_id, RouteMap::SOURCE_PRODUCT, $id);
283 } 250 }
284 } 251 }
285 } 252 }
@@ -329,7 +296,11 @@ class ProjectUpdate extends Command @@ -329,7 +296,11 @@ class ProjectUpdate extends Command
329 echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL; 296 echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL;
330 continue; 297 continue;
331 } 298 }
  299 + } else {
  300 + $id = $news['id'];
332 } 301 }
  302 +
  303 + CollectTask::_insert($item['url'], $project_id, $api_type == 'news' ? RouteMap::SOURCE_NEWS : RouteMap::SOURCE_BLOG, $id);
333 } 304 }
334 } 305 }
335 } 306 }
@@ -373,7 +344,11 @@ class ProjectUpdate extends Command @@ -373,7 +344,11 @@ class ProjectUpdate extends Command
373 echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL; 344 echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL;
374 continue; 345 continue;
375 } 346 }
  347 + } else {
  348 + $id = $custom['id'];
376 } 349 }
  350 +
  351 + CollectTask::_insert($item['url'], $project_id, RouteMap::SOURCE_PAGE, $id);
377 } 352 }
378 } 353 }
379 } 354 }
@@ -423,6 +398,7 @@ class ProjectUpdate extends Command @@ -423,6 +398,7 @@ class ProjectUpdate extends Command
423 return $arr[count($arr) - 2]; 398 return $arr[count($arr) - 2];
424 } 399 }
425 400
  401 + //多级分类入库
426 protected function category_insert($project_id, $items, $pid = 0) 402 protected function category_insert($project_id, $items, $pid = 0)
427 { 403 {
428 $model = new Category(); 404 $model = new Category();
  1 +<?php
  2 +
  3 +namespace App\Models\Collect;
  4 +
  5 +use App\Models\Base;
  6 +
  7 +class CollectSource extends Base
  8 +{
  9 + //设置关联表名
  10 + protected $table = 'gl_collect_source';
  11 +
  12 + //连接数据库
  13 + protected $connection = 'custom_mysql';
  14 +
  15 +
  16 +}
  1 +<?php
  2 +
  3 +namespace App\Models\Collect;
  4 +
  5 +use App\Models\Base;
  6 +
  7 +class CollectTask extends Base
  8 +{
  9 + //设置关联表名
  10 + protected $table = 'gl_collect_task';
  11 +
  12 + //连接数据库
  13 + protected $connection = 'custom_mysql';
  14 +
  15 + const STATUS_UN = 0;
  16 + const STATUS_ING = 1;
  17 + const STATUS_COM= 2;
  18 +
  19 + public static function _insert($url, $project_id, $source, $source_id)
  20 + {
  21 + if(!$url){
  22 + return;
  23 + }
  24 +
  25 + $url_arr = parse_url($url);
  26 +
  27 + $data = [
  28 + 'project_id' => $project_id,
  29 + 'source' => $source,
  30 + 'source_id' => $source_id,
  31 + 'domain' => $url_arr['host'],
  32 + 'route' => $url_arr['path']
  33 + ];
  34 +
  35 + $task = self::where($data)->first();
  36 + if(!$task){
  37 + $data['created_at'] = $data['updated_at'] = date('Y-m-d H:i:s');
  38 + self::insert($data);
  39 + }
  40 + }
  41 +}
@@ -13,6 +13,9 @@ class UpdateLog extends Model @@ -13,6 +13,9 @@ class UpdateLog extends Model
13 const STATUS_ING = 1;//导入中 13 const STATUS_ING = 1;//导入中
14 const STATUS_COM = 2;//导入完成 14 const STATUS_COM = 2;//导入完成
15 15
  16 + const COLLECT_STATUS_UN = 0;//未开始
  17 + const COLLECT_STATUS_COM = 1;//采集完成
  18 +
16 /** 19 /**
17 * 创建更新日志 20 * 创建更新日志
18 * @param $project_id 21 * @param $project_id
@@ -30,6 +33,7 @@ class UpdateLog extends Model @@ -30,6 +33,7 @@ class UpdateLog extends Model
30 $log->api_type = $type; 33 $log->api_type = $type;
31 $log->api_url = $url; 34 $log->api_url = $url;
32 $log->sort = $type == 'category' ? 0 :1; 35 $log->sort = $type == 'category' ? 0 :1;
  36 + $log->collect_status = in_array($type, ['website_info', 'category']) ? 1 : 0;
33 return $log->save(); 37 return $log->save();
34 } 38 }
35 return true; 39 return true;