作者 刘锟

合并分支 'akun' 到 'master'

Akun



查看合并请求 !34
<?php
namespace App\Console\Commands\Update;
use App\Helper\Arr;
use App\Http\Logic\Bside\Product\CategoryLogic;
use App\Http\Logic\Bside\Product\KeywordLogic;
use App\Models\Blog\Blog;
use App\Models\Collect\CollectSource;
use App\Models\Collect\CollectTask;
use App\Models\Com\UpdateLog;
use App\Models\News\News;
use App\Models\Product\Category;
use App\Models\Product\Keyword;
use App\Models\Product\Product;
use App\Models\RouteMap\RouteMap;
use App\Models\Template\BCustomTemplate;
use App\Models\WebSetting\WebSettingReceiving;
use App\Services\CosService;
use App\Services\ProjectServer;
use App\Utils\HttpUtils;
use Illuminate\Console\Command;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\Redis;
/**
 * Collects HTML pages for projects being upgraded from 4.0/5.0 to 6.0.
 *
 * Runs as a long-lived worker: it pops collect tasks from a Redis list,
 * fetches each page, uploads the static resources the page references and
 * stores the rewritten HTML on the task row.
 *
 * Class HtmlCollect
 * @package App\Console\Commands\Update
 * @author Akun
 * @date 2023/11/10 16:04
 */
class HtmlCollect extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'project_html_collect';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = '执行项目html页面采集';

    /**
     * Command entry point. Loops forever, handling one task per iteration;
     * the pacing (sleep) is done inside start_update().
     */
    public function handle()
    {
        while (true) {
            $this->start_update();
        }
    }

    /**
     * Fetch one task from the queue and process it.
     *
     * @return bool always true; the return value is not used by handle()
     */
    protected function start_update()
    {
        $task_id = $this->get_task();
        if ($task_id === false) {
            // Nothing to collect for any project right now.
            sleep(60);
            return true;
        } elseif ($task_id === 0) {
            // The current project has no pending tasks left.
            sleep(2);
            return true;
        }

        // Queue entries are "<project_id>_<collect_task_id>" (see get_task()).
        $task_arr = explode('_', (string)$task_id);
        if (count($task_arr) < 2) {
            // Malformed entry: skip it instead of reading a missing index.
            echo 'date:' . date('Y-m-d H:i:s') . ', invalid task id: ' . $task_id . PHP_EOL;
            sleep(2);
            return true;
        }
        $project_id = $task_arr[0];
        $collect_id = $task_arr[1];

        try {
            // Switch the custom connection to this project's database.
            $project = ProjectServer::useProject($project_id);
            if ($project) {
                // Only pick the task up if it has not been started yet; the
                // same id can be queued more than once.
                $collect_info = CollectTask::select(['id', 'domain', 'route'])
                    ->where('id', $collect_id)
                    ->where('status', CollectTask::STATUS_UN)
                    ->first();
                if ($collect_info) {
                    $this->collect_page($collect_info, $project_id, $collect_id);
                }
            }
        } finally {
            // Always release the project connection, including on the
            // "task not found" and failure paths.
            DB::disconnect('custom_mysql');
        }

        sleep(2);
        return true;
    }

    /**
     * Download one page, localise its resources and store the result.
     *
     * On failure the error is logged and the task is left in STATUS_ING,
     * so it is not picked up again by get_task().
     *
     * @param mixed $collect_info CollectTask row with id, domain and route loaded
     * @param int|string $project_id
     * @param int|string $collect_id
     * @return void
     */
    protected function collect_page($collect_info, $project_id, $collect_id)
    {
        echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', task_type: ' . $collect_id . ', collect start' . PHP_EOL;

        $collect_info->status = CollectTask::STATUS_ING;
        $collect_info->save();

        // Fetch the page, download its resources and replace the references.
        try {
            $page_url = 'https://' . $collect_info->domain . $collect_info->route;
            $html = file_get_contents($page_url);
            if ($html === false) {
                // Without this check a failed fetch could be stored as the
                // page content and the task marked as complete.
                throw new \RuntimeException('failed to fetch ' . $page_url);
            }

            $source_list = $this->html_preg($html, $project_id, $collect_info->domain);
            if ($source_list) {
                $html = $this->upload_source($html, $source_list, $project_id);
            }
        } catch (\Exception $e) {
            echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', task_type: ' . $collect_id . ', error: ' . $e->getMessage() . PHP_EOL;
            return;
        }

        $collect_info->html = $html;
        $collect_info->status = CollectTask::STATUS_COM;
        $collect_info->save();

        echo 'date:' . date('Y-m-d H:i:s') . ', project_id: ' . $project_id . ', task_type: ' . $collect_id . ', collect end' . PHP_EOL;
    }

    /**
     * Get the next task id, refilling the Redis queue when it is empty.
     *
     * @return string|int|false "<project_id>_<task_id>" for a task to run,
     *                          0 when the current project has just been
     *                          marked as collected, false when there is
     *                          nothing to do
     */
    protected function get_task()
    {
        $key = 'console_html_collect_task';

        $task_id = Redis::rpop($key);
        if ($task_id) {
            return $task_id;
        }

        // Queue is empty: find the next imported project that still needs collecting.
        $update_log = UpdateLog::where('status', UpdateLog::STATUS_COM)
            ->where('collect_status', UpdateLog::COLLECT_STATUS_UN)
            ->orderBy('project_id', 'asc')
            ->first();
        if (!$update_log) {
            return false;
        }

        $complete = false;
        try {
            // Switch the custom connection to this project's database.
            $project = ProjectServer::useProject($update_log->project_id);
            if ($project) {
                $collect_list = CollectTask::select(['id', 'project_id'])
                    ->where('project_id', $update_log->project_id)
                    ->where('status', CollectTask::STATUS_UN)
                    ->limit(50)
                    ->get();
                if ($collect_list->count() == 0) {
                    $complete = true;
                } else {
                    foreach ($collect_list as $collect) {
                        Redis::lpush($key, $collect['project_id'] . '_' . $collect['id']);
                    }
                }
            }
        } finally {
            // Release the project connection.
            DB::disconnect('custom_mysql');
        }

        if ($complete) {
            // Persist the flag; without save() the same log row is selected
            // on every call and later projects are never reached.
            $update_log->collect_status = UpdateLog::COLLECT_STATUS_COM;
            $update_log->save();
            return 0;
        }

        // Normalise an empty pop (null or false, depending on the Redis
        // client) to false so the caller's strict checks handle it.
        $task_id = Redis::rpop($key);
        return $task_id ?: false;
    }

    /**
     * Extract the resource URLs referenced by the page that need downloading.
     *
     * Scans <img src>, <script src>, <source src> and <link href>. Each
     * distinct URL is checked once, so the result contains no duplicates.
     *
     * @param string $html
     * @param int|string $project_id
     * @param string $domain domain of the page, used for host-less URLs
     * @return array list of ['url' => ..., 'url_complete' => ...] entries
     */
    protected function html_preg($html, $project_id, $domain)
    {
        $source = [];
        if (!$html) {
            return $source;
        }

        // tag => attribute that carries the resource URL
        $targets = [
            'img' => 'src',      // images
            'script' => 'src',   // js
            'source' => 'src',   // video
            'link' => 'href',    // css
        ];

        $seen = [];
        foreach ($targets as $tag => $attr) {
            $pattern = '/<' . $tag . '\s+[^>]*?' . $attr . '\s*=\s*(\'|\")(.*?)\\1[^>]*?\/?\s*>/i';
            if (!preg_match_all($pattern, $html, $matches)) {
                continue;
            }
            foreach ($matches[2] as $url) {
                if (isset($seen[$url])) {
                    // Same URL referenced again: already decided.
                    continue;
                }
                $seen[$url] = true;

                $checked = $this->url_check($url, $project_id, $domain);
                if ($checked) {
                    $source[] = $checked;
                }
            }
        }

        return $source;
    }

    /**
     * Decide whether a resource URL has to be downloaded.
     *
     * A URL is skipped when it is empty or unparsable, uses a scheme other
     * than http/https, is hosted on a ".globalso." / ".goodao." host, has
     * no path containing a dot, or already has a CollectSource row for this
     * project.
     *
     * @param string $url URL exactly as written in the HTML
     * @param int|string $project_id
     * @param string $domain host to use when the URL has none
     * @return array|false ['url' => original, 'url_complete' => absolute URL] or false
     */
    protected function url_check($url, $project_id, $domain)
    {
        if (!$url) {
            return false;
        }

        $arr = parse_url($url);
        if (!is_array($arr)) {
            // parse_url() returns false for seriously malformed URLs.
            return false;
        }
        $scheme = $arr['scheme'] ?? '';
        $host = $arr['host'] ?? '';
        $path = $arr['path'] ?? '';
        $query = $arr['query'] ?? '';

        // data:, javascript: and similar URIs cannot be rebuilt as a remote URL.
        if ($scheme !== '' && !in_array(strtolower($scheme), ['http', 'https'], true)) {
            return false;
        }

        if (strpos($host, '.globalso.') !== false || strpos($host, '.goodao.') !== false) {
            return false;
        }

        // Only paths that contain a dot are treated as downloadable files.
        if (!$path || strpos($path, '.') === false) {
            return false;
        }

        // NOTE(review): a URL that was already collected is skipped here, so
        // it is also not replaced in this page's HTML - confirm that is intended.
        if (CollectSource::where('project_id', $project_id)->where('origin', $url)->first()) {
            return false;
        }

        // A relative path such as "images/a.png" must not be glued directly
        // onto the host name. It is resolved against the site root as a best
        // effort, because the page route is not available here.
        if ($host === '' && $path[0] !== '/') {
            $path = '/' . $path;
        }

        return [
            'url' => $url,
            'url_complete' => ($scheme ?: 'https') . '://' . ($host ?: $domain) . $path . ($query ? '?' . $query : '')
        ];
    }

    /**
     * Upload each resource and replace its reference in the HTML.
     *
     * @param string $html
     * @param array $source entries produced by html_preg()
     * @param int|string $project_id
     * @return string HTML with successfully uploaded resources replaced
     */
    protected function upload_source($html, $source, $project_id)
    {
        $handled = [];
        foreach ($source as $vs) {
            $origin = $vs['url'];
            if (isset($handled[$origin])) {
                // Already processed in this run; avoid a second upload and row.
                continue;
            }
            $handled[$origin] = true;

            $new_source = CosService::uploadRemote($project_id, 'source', $vs['url_complete']);
            if ($new_source) {
                $now = date('Y-m-d H:i:s');
                CollectSource::insert([
                    'project_id' => $project_id,
                    'origin' => $origin,
                    'target' => $new_source,
                    'created_at' => $now,
                    'updated_at' => $now,
                ]);
                $html = str_replace($origin, $new_source, $html);
            }
        }

        return $html;
    }
}
... ...
... ... @@ -6,6 +6,7 @@ use App\Helper\Arr;
use App\Http\Logic\Bside\Product\CategoryLogic;
use App\Http\Logic\Bside\Product\KeywordLogic;
use App\Models\Blog\Blog;
use App\Models\Collect\CollectTask;
use App\Models\Com\UpdateLog;
use App\Models\News\News;
use App\Models\Product\Category;
... ... @@ -77,7 +78,7 @@ class ProjectUpdate extends Command
$task->save();
//设置数据库
$project = ProjectServer::useProject($task->project_id);
$project = ProjectServer::useProject($project_id);
if ($project) {
if ($api_type == 'category') {
//分类
... ... @@ -86,48 +87,6 @@ class ProjectUpdate extends Command
if (isset($data['code']) && $data['code'] == 200) {
$items = $data['data'] ?? [];
$this->category_insert($project_id, $items, 0);
// $model = new Category();
// foreach ($items as $item) {
// $parent = $model->read(['pid' => 0, 'title' => $item['name']], 'id');
// if (!$parent) {
// try {
// $parent_id = $model->addReturnId([
// 'project_id' => $project_id,
// 'title' => $item['name'],
// 'pid' => 0,
// 'keywords' => $item['keywords'],
// 'describe' => $item['description']
// ]);
// $route = RouteMap::setRoute($item['url'] ? $this->get_url_route($item['url']) : $item['name'], RouteMap::SOURCE_PRODUCT_CATE, $parent_id, $project_id);
// $model->edit(['route' => $route], ['id' => $parent_id]);
// } catch (\Exception $e) {
// echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL;
// continue;
// }
// } else {
// $parent_id = $parent['id'];
// }
//
// foreach ($item['children'] as $child) {
// $child_info = $model->read(['pid' => $parent_id, 'title' => $child['name']]);
// if (!$child_info) {
// try {
// $child_id = $model->addReturnId([
// 'project_id' => $project_id,
// 'title' => $child['name'],
// 'pid' => $parent_id,
// 'keywords' => $child['keywords'],
// 'describe' => $child['description']
// ]);
// $route = RouteMap::setRoute($child['url'] ? $this->get_url_route($child['url']) : $child['name'], RouteMap::SOURCE_PRODUCT_CATE, $child_id, $project_id);
// $model->edit(['route' => $route], ['id' => $child_id]);
// } catch (\Exception $e) {
// echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL;
// continue;
// }
// }
// }
// }
} else {
return true;
}
... ... @@ -164,7 +123,11 @@ class ProjectUpdate extends Command
echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL;
continue;
}
} else {
$id = $keyword['id'];
}
CollectTask::_insert($item['url'], $project_id, RouteMap::SOURCE_PRODUCT_KEYWORD, $id);
}
}
}
... ... @@ -279,7 +242,11 @@ class ProjectUpdate extends Command
echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL;
continue;
}
} else {
$id = $product['id'];
}
CollectTask::_insert($item['url'], $project_id, RouteMap::SOURCE_PRODUCT, $id);
}
}
}
... ... @@ -329,7 +296,11 @@ class ProjectUpdate extends Command
echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL;
continue;
}
} else {
$id = $news['id'];
}
CollectTask::_insert($item['url'], $project_id, $api_type == 'news' ? RouteMap::SOURCE_NEWS : RouteMap::SOURCE_BLOG, $id);
}
}
}
... ... @@ -373,7 +344,11 @@ class ProjectUpdate extends Command
echo 'date:' . date('Y-m-d H:i:s') . ', task_id: ' . $task->id . ', error: ' . $e->getMessage() . PHP_EOL;
continue;
}
} else {
$id = $custom['id'];
}
CollectTask::_insert($item['url'], $project_id, RouteMap::SOURCE_PAGE, $id);
}
}
}
... ... @@ -423,6 +398,7 @@ class ProjectUpdate extends Command
return $arr[count($arr) - 2];
}
//多级分类入库
protected function category_insert($project_id, $items, $pid = 0)
{
$model = new Category();
... ...
<?php
namespace App\Models\Collect;
use App\Models\Base;
class CollectSource extends Base
{
    /**
     * Name of the database connection used by this model.
     *
     * @var string
     */
    protected $connection = 'custom_mysql';

    /**
     * Name of the table backing this model.
     *
     * @var string
     */
    protected $table = 'gl_collect_source';
}
... ...
<?php
namespace App\Models\Collect;
use App\Models\Base;
class CollectTask extends Base
{
    /**
     * Name of the table backing this model.
     *
     * @var string
     */
    protected $table = 'gl_collect_task';

    /**
     * Name of the database connection used by this model.
     *
     * @var string
     */
    protected $connection = 'custom_mysql';

    // Task status values.
    const STATUS_UN = 0;
    const STATUS_ING = 1;
    const STATUS_COM = 2;

    /**
     * Create a collect task for a page URL unless an identical one exists.
     *
     * Nothing is inserted when the URL is empty, cannot be parsed, or has
     * no host (for example a relative URL), because a task needs a domain.
     * A URL without a path is stored with the route "/".
     *
     * @param string $url absolute URL of the page to collect
     * @param int|string $project_id
     * @param mixed $source source type stored on the task
     * @param int|string $source_id id of the record the page belongs to
     * @return void
     */
    public static function _insert($url, $project_id, $source, $source_id)
    {
        if (!$url) {
            return;
        }

        // parse_url() returns false on malformed input and omits the keys
        // of components that are absent, so read them defensively.
        $url_arr = parse_url($url);
        if (!is_array($url_arr)) {
            return;
        }
        $host = $url_arr['host'] ?? '';
        if ($host === '') {
            return;
        }
        $path = $url_arr['path'] ?? '';

        $data = [
            'project_id' => $project_id,
            'source' => $source,
            'source_id' => $source_id,
            'domain' => $host,
            'route' => $path !== '' ? $path : '/'
        ];

        // Skip the insert when the same task has already been recorded.
        $task = self::where($data)->first();
        if (!$task) {
            $data['created_at'] = $data['updated_at'] = date('Y-m-d H:i:s');
            self::insert($data);
        }
    }
}
... ...
... ... @@ -13,6 +13,9 @@ class UpdateLog extends Model
const STATUS_ING = 1;//导入中
const STATUS_COM = 2;//导入完成
const COLLECT_STATUS_UN = 0;//未开始
const COLLECT_STATUS_COM = 1;//采集完成
/**
* 创建更新日志
* @param $project_id
... ... @@ -30,6 +33,7 @@ class UpdateLog extends Model
$log->api_type = $type;
$log->api_url = $url;
$log->sort = $type == 'category' ? 0 :1;
$log->collect_status = in_array($type, ['website_info', 'category']) ? 1 : 0;
return $log->save();
}
return true;
... ...