[PhpSelenium] 3.定时爬虫 + 多任务爬虫 + 代理池 PHP Selenium 发布于2020-02-18 / 更新于2021-01-02 07:04
仅用于交流和学习,禁止利用本资源从事任何违反本国(地区)法律法规的活动,一切遵守《网络安全法》
Tips:只是提供一个思路,实际项目中还需维护代理池可用性等细节部分
实战步骤
- 框架及核心库部署
- 定时更新代理池进程
- 定时爬取列表页进程
- 主进程定时从Redis中读取列表页任务,有则将每一项丢给异步任务执行
环境
- CentOS 7.2
- PHP 7.2
- Swoole 4.3.5
- Google Chrome 78.0.3904.108
- ChromeDriver 78.0.3904.105
- Composer
- facebook/webdriver=1.7
- easyswoole/easyswoole=3.1.18
- easyswoole/curl=1.0.1
框架及核心库部署
1、安装EasySwoole 3.1.18版本
[root@ar414.com phpseleniumdemo]# composer require easyswoole/easyswoole=3.1.18
[ root@ar414.com phpseleniumdemo] php vendor/easyswoole/easyswoole/bin/easyswoole install
______ _____ _
| ____| / ____| | |
| | __ __ _ ___ _ _ | ( ___ __ __ ___ ___ | | ___
| __| / _` | / __| | | | | \ ___ \ \ \ /\ / / / _ \ / _ \ | | / _ \
| | ____ | ( _| | \ __ \ | | _| | ____) | \ V V / | ( _) | | ( _) | | | | __/
| ______| \ __,_| | ___/ \ __, | | _____/ \ _/\ _/ \ ___/ \ ___/ | _| \ ___|
__/ |
| ___/
install success,enjoy!
2、安装核心库facebook/webdriver、easyswoole/curl
[ root@ar414.com phpseleniumdemo] # composer require facebook/webdriver=1.7
[ root@ar414.com phpseleniumdemo] # composer require easyswoole/curl=1.0.1
3、确认运行没报错
[ root@ar414.com phpseleniumdemo] # php easyswoole start
| ____| / ____| | |
| | __ __ _ ___ _ _ | ( ___ __ __ ___ ___ | | ___
>| __| / _` | / __| | | | | \ ___ \ \ \ /\ / / / _ \ / _ \ | | / _ \
>| | ____ | ( _| | \ __ \ | | _| | ____) | \ V V / | ( _) | | ( _) | | | | __/
>| ______| \ __,_| | ___/ \ __, | | _____/ \ _/\ _/ \ ___/ \ ___/ | _| \ ___|
> __/ |
> | ___/
main server SWOOLE_WEB
listen address 0.0.0.0
listen port 9501
sub server1 CONSOLE => SWOOLE_TCP@127.0.0.1:9500
.. ..
定时更新代理池进程 Tips:代理资源请自行解决,这里只提供例子,实际是用不了的
1、 创建项目主目录
[root@ar414.com phpseleniumdemo]# mkdir App
#composer 指定App作用域
[root@ar414.com phpseleniumdemo]# cat composer.json
{
"autoload": {
"psr-4": { "App\\": "App/"
}
},
"require": {
"easyswoole/easyswoole": "3.1.18",
"facebook/webdriver": "^1.7",
"easyswoole/curl": "1.0.1"
}
}
#更新composer autoload
[root@ar414.com phpseleniumdemo]# composer dump-autoload
2、创建进程目录(将代理池更新作为一个子进程随项目启动运行)
[root@ar414.com phpseleniumdemo]# mkdir App/Process
3、代理池定时爬取(使用Redis List类型保证最新代理IP在头部,爬虫逻辑每次从头部获取,一个代理IP只用一次)
Tips:代理资源请自行解决,这里只提供例子,实际是用不了的
完整代码链接
<?php
/**
* Created by PhpStorm.
* User: ar414.com@gmail.com
* Date: 2019/12/7
* Time: 21:00
*/
namespace App\ Process ;
use App\ Lib\ Curl ;
use App\ Lib\ Kv ;
use EasySwoole\ Component\ Process\ AbstractProcess ;
/**
 * Child process that periodically fetches proxy addresses from the proxy-list
 * API and pushes them onto the head of a Redis list.
 */
class UpdateProxyPool extends AbstractProcess
{
    /**
     * Proxy-list endpoint template; the two %s placeholders are filled in by
     * initProxyListApi(). The proxy IPs returned here only support SOCKS5.
     */
    private $proxyListApi = "http://www.zdopen.com/ShortS5Proxy/GetIP/?api=%s&akey=%s&order=2&type=3";

    /** Redis list key that stores the proxies as "ip:port" strings. */
    const PROXY_KV_KEY = 'spider:proxy:list';

    /** Seconds to sleep between two refresh cycles. */
    const TIMER = 15;

    /**
     * Fills the API id and key into the endpoint template.
     *
     * The values below are the article's placeholders; a real deployment would
     * read them from configuration, e.g. $_ENV['PROXY_LIST_API'] and
     * $_ENV['PROXY_LIST_KEY'].
     */
    protected function initProxyListApi()
    {
        $this->proxyListApi = sprintf($this->proxyListApi, 20191231231237085, '72axxxae0fe34');
    }

    /**
     * Process entry point: refreshes the proxy pool forever, once every
     * self::TIMER seconds.
     *
     * @param mixed $arg argument passed when the process was constructed (unused)
     */
    public function run($arg)
    {
        $this->initProxyListApi();

        // Requires: composer require easyswoole/curl=1.0.1
        while (true) {
            try {
                $this->refreshPool();
            } catch (\Throwable $throwable) {
                // A failed HTTP call or Redis hiccup must not end the loop;
                // log it and retry on the next cycle.
                \EasySwoole\EasySwoole\Logger::getInstance()->log($throwable->getMessage(), 'UpdateProxyPoolError');
            }

            sleep(self::TIMER);
        }
    }

    /**
     * Performs one refresh cycle: fetch the list, validate it, push each
     * "ip:port" entry to the head of the Redis list.
     */
    private function refreshPool()
    {
        $response = Curl::get($this->proxyListApi);
        var_dump($response);
        if (!$response) {
            return;
        }

        $payload = json_decode($response, true);
        // json_decode() yields null on malformed input; also make sure the
        // fields read below exist before touching them.
        if (!is_array($payload) || !isset($payload['code'], $payload['data']['proxy_list'])) {
            return;
        }

        $proxyList = $payload['data']['proxy_list'];
        // Loose comparison on purpose: the code may arrive as int or numeric string.
        if ($payload['code'] != 10001 || !is_array($proxyList) || empty($proxyList)) {
            return;
        }

        foreach ($proxyList as $proxy) {
            if (!isset($proxy['ip'], $proxy['port'])) {
                // Skip malformed entries instead of storing a broken "ip:port".
                continue;
            }

            $proxyItem = $proxy['ip'] . ':' . $proxy['port'];
            // lPush puts the newest proxy at the head of the list.
            Kv::redis()->lPush(self::PROXY_KV_KEY, $proxyItem);
        }
    }
}
4、配置代理池更新进程随项目启动时启动(完整代码链接 )
/**
 * Attaches the proxy-pool updater to the main Swoole server as a child process,
 * so it starts together with the project.
 *
 * @param EventRegister $register event registrar (not used in this snippet)
 */
public static function mainServerCreate(EventRegister $register)
{
    $swooleServer = ServerManager::getInstance()->getSwooleServer();

    // Proxy-pool refresh process
    $proxyPoolUpdater = new \App\Process\UpdateProxyPool('UpdateProxyPool', []);
    $swooleServer->addProcess($proxyPoolUpdater->getProcess());
}
定时爬取列表页进程
爬取列表页进程(完整代码链接)
<?php
/**
* Created by PhpStorm.
* User: ar414.com@gmail.com
* Date: 2019/12/7
* Time: 22:01
*/
namespace App\ Process ;
use App\ Lib\ ChromeDriver ;
use App\ Lib\ Kv ;
use EasySwoole\ Component\ Process\ AbstractProcess ;
use EasySwoole\ EasySwoole\ Logger ;
/**
 * Child process that periodically loads the list endpoint in Chrome, extracts
 * the "PD=...;" entries from the page source and stores them in Redis as a
 * JSON array for the task dispatcher to pick up.
 */
class ListSpider extends AbstractProcess
{
    /** URL of the list endpoint that is crawled. */
    const API = 'https://www.188-sb.com/SportsBook.API/web?lid=1&zid=3&pd=%23AC%23B151%23C1%23D50%23E10%23F163%23&cid=42&ctid=42';

    /** Redis key under which the JSON-encoded list is stored. */
    const LIST_KV_KEY = 'spider:list';

    /** Seconds to sleep between two crawls. */
    const TIMER = 20;

    /**
     * Process entry point: crawls the list page forever, once every
     * self::TIMER seconds.
     *
     * @param mixed $arg argument passed when the process was constructed (unused)
     */
    public function run($arg)
    {
        while (true) {
            // Reset on every iteration so the cleanup below never touches a
            // driver left over from a previous (already quit) session.
            $driver = null;

            try {
                $driver = (new ChromeDriver(true))->getDriver();
                $driver->get(self::API);
                $listStr = $driver->getPageSource();
                var_dump($listStr);
                file_put_contents("/www/wwwroot/blog/phpseleniumdemo/listStr.html", $listStr);

                preg_match_all("/PD=(.*);/U", $listStr, $list);
                // array_unique() keeps the original keys; re-index so that
                // json_encode() emits a JSON array rather than an object.
                $list = array_values(array_unique($list[1]));
                if ($list) {
                    Kv::redis()->set(self::LIST_KV_KEY, json_encode($list));
                }
                var_dump('done');
            } catch (\Throwable $throwable) {
                Logger::getInstance()->log($throwable->getMessage(), 'ListSpiderError');
                var_dump($throwable->getMessage());
            } finally {
                $this->shutdownDriver($driver);
            }

            sleep(self::TIMER);
        }
    }

    /**
     * Ends the browser session without letting a cleanup failure escape.
     *
     * @param object|null $driver driver returned by ChromeDriver::getDriver(),
     *                            or null when it could not be created
     */
    private function shutdownDriver($driver)
    {
        if ($driver === null) {
            return;
        }

        try {
            // quit() closes every window and ends the session, so a separate
            // close() call beforehand is not needed.
            $driver->quit();
        } catch (\Throwable $throwable) {
            Logger::getInstance()->log($throwable->getMessage(), 'ListSpiderError');
        }
    }
}
主进程定时从Redis中读取列表页任务,有则将每一项丢给异步任务执行
1、注册子进程和定时任务(完整代码链接)
/**
 * Registers both crawler child processes and, on worker 0, a 30-second timer
 * that reads the crawled list from Redis and hands every entry to an async task.
 *
 * @param EventRegister $register event registrar used to hook onWorkerStart
 */
public static function mainServerCreate(EventRegister $register)
{
    // Proxy-pool refresh process
    ServerManager::getInstance()->getSwooleServer()->addProcess((new \App\Process\UpdateProxyPool('UpdateProxyPool', []))->getProcess());
    // List-page crawling process
    ServerManager::getInstance()->getSwooleServer()->addProcess((new \App\Process\ListSpider('ListSpider', []))->getProcess());

    $register->set($register::onWorkerStart, function (\swoole_server $server, $workerId) {
        // Only the first worker installs the timer.
        if ((int) $workerId !== 0) {
            return;
        }

        Timer::getInstance()->loop(30000, function () {
            $ret = Kv::redis()->get(ListSpider::LIST_KV_KEY);
            if (!$ret) {
                return;
            }

            $items = json_decode($ret, true);
            // json_decode() returns null for corrupt data; iterating that
            // would raise a warning on every tick.
            if (!is_array($items)) {
                return;
            }

            foreach ($items as $item) {
                TaskManager::async(function () use ($item) {
                    (new ItemSpider(true))->run($item);
                    return true;
                }, function () use ($item) {
                    var_dump("{$item} Done");
                });
            }
        });
    });
}
2、ItemSpider逻辑代码(完整代码链接 )
<?php
/**
* Created by PhpStorm.
* User: ar414.com@gmail.com
* Date: 2019/12/7
* Time: 22:35
*/
namespace App\ Spider ;
use App\ Lib\ ChromeDriver ;
use EasySwoole\ EasySwoole\ Logger ;
use Facebook\ WebDriver\ WebDriverBy ;
use Facebook\ WebDriver\ WebDriverExpectedCondition ;
/**
 * Crawls a single item page in Chrome and dumps its rendered source.
 */
class ItemSpider
{
    /**
     * Opens the page for one list entry, waits for the market-group element to
     * become visible, then dumps the page source.
     *
     * Failures are logged rather than rethrown, and the browser session is
     * ended whether or not the crawl succeeded.
     *
     * @param string $itemPath list entry; every '#' in it is replaced by '/'
     *                         before it is appended to the site's '#' fragment
     */
    public function run($itemPath)
    {
        $itemPath = str_replace('#', '/', $itemPath);
        $url = "https://www.188-sb.com/#{$itemPath}";
        var_dump($url);

        $driver = null;
        try {
            // Created inside the try so a failed browser start is logged too.
            $driver = (new ChromeDriver(true))->getDriver();
            $driver->get($url);
            $driver->wait(ChromeDriver::WAIT_SECONDS)->until(
                WebDriverExpectedCondition::visibilityOfElementLocated(
                    WebDriverBy::className('gl-MarketGroupButton_Text')
                )
            );

            Logger::getInstance()->console("The title is '" . $driver->getTitle() . "'\n");
            Logger::getInstance()->console("The current URI is '" . $driver->getCurrentURL() . "'\n");

            $body = $driver->getPageSource();
            var_dump($body);
            // TODO: clean the data and persist it
        } catch (\Throwable $throwable) {
            Logger::getInstance()->log($throwable->getMessage(), 'Bet365ApiRun');
        } finally {
            if ($driver !== null) {
                try {
                    // quit() closes every window and ends the session.
                    $driver->quit();
                } catch (\Throwable $quitError) {
                    // A dead session must not turn cleanup into a second failure.
                    Logger::getInstance()->log($quitError->getMessage(), 'Bet365ApiRun');
                }
            }
        }
    }
}
3、运行
[ root@ar414.com phpseleniumdemo] # php easyswoole start
讨论