登录  注册 退出
苏杭php自学网
记录个人学习过程中碰见的问题和感受
要温柔,也要有屠龙的勇气
本站有学习资源,可自由下载学习资料
  1. 首页 > PHP自学教程

基于phpQuery的网站数据采集

作者:苏杭 日期:2021-04-08 17:39:38 点击数:

源码: 

基于phpQuery的网站数据采集(图1)1-21040QJ205U1.rar


使用 phpQuery 这个数据采集类库,对目标网站进行了一次数据爬取,具体代码实现如下:


<?php
// Load the phpQuery library from the application's components directory.
require_once(Yii::app()->basePath."/components/phpQuery/phpQuery.php");
// Define YII_DEBUG as true unless the constant has already been defined.
// NOTE(review): Yii::app() is already callable on the line above, so the framework
// is loaded by this point and has presumably defined YII_DEBUG itself — confirm
// whether this statement can ever take effect here.
defined('YII_DEBUG') or define('YII_DEBUG', true);
/**
 * Scrapes item pages with phpQuery and stores one row per page in the
 * `ym_baikes` table through the `db_press` connection.
 */
class CurlController extends Controller
{
    /** Base URL that each item slug is appended to. */
    const ITEM_BASE_URL = 'https://www.soyoung.com/itemk/';

    /** How many leading entries of the slug list one run visits (list indexes 0..50). */
    const CRAWL_LIMIT = 51;

    /**
     * Debug endpoint: prints "1" and terminates the request.
     */
    public function actionBb()
    {
        echo 1;
        die();
    }

    /**
     * Reads the post-operative care section of the currently selected
     * phpQuery document.
     *
     * Every `.nursing-title` element is paired with two `.nursing-des`
     * elements: title number k goes with descriptions 2k and 2k+1.
     *
     * @return string JSON-encoded list of [title, first description, second description]
     *                triples; the string "[]" when the document has no `.nursing-title`.
     */
    public function getNursing()
    {
        $titleCount = pq(".nursing-title")->size();

        $sections = [];
        for ($k = 0; $k < $titleCount; $k++) {
            $firstDes  = 2 * $k;
            $secondDes = 2 * $k + 1;

            $sections[] = [
                pq(".nursing-title:eq($k)")->text(),
                pq(".nursing-des:eq($firstDes)")->text(),
                pq(".nursing-des:eq($secondDes)")->text(),
            ];
        }

        return json_encode($sections);
    }

    /**
     * Crawls the first CRAWL_LIMIT entries of the slug list, inserts every
     * page that has a title into `ym_baikes`, then dumps the collected rows
     * and terminates the request.
     */
    public function actionIndex()
    {
        // Many remote pages are fetched sequentially, so raise the time limit.
        ini_set('max_execution_time', 6000);

        // Keep the original list indexes as keys so the dumped rows can be
        // matched back to their position in the slug list.
        $batch = array_slice($this->itemSlugs(), 0, self::CRAWL_LIMIT, true);

        // An empty slug would only fetch the bare base URL, and a repeated slug
        // would insert the same page twice; skip both.
        $batch = array_filter($batch, function ($slug) {
            return $slug !== '';
        });
        $batch = array_unique($batch);

        $data = [];
        foreach ($batch as $index => $slug) {
            $row = $this->scrapeItem(self::ITEM_BASE_URL . $slug);
            if ($row === null) {
                continue; // page without a title: nothing to store
            }

            // Yii 1.1's CDbCommand::insert() builds AND executes the statement and
            // returns the affected-row count, so no execute() call is chained on it.
            Yii::app()->db_press->createCommand()->insert('ym_baikes', $row);

            $data[$index] = $row;
        }

        var_dump($data);
        die();
    }

    /**
     * Loads one item page and extracts the columns stored in `ym_baikes`.
     *
     * @param string $url Absolute URL of the item page.
     * @return array|null Column => value map without empty values, or null
     *                    when the page has no <h1> text.
     */
    private function scrapeItem($url)
    {
        // Makes the fetched page the document that subsequent pq() calls query.
        phpQuery::newDocumentFile($url);

        $title = pq("h1")->text();
        if ($title === null || $title === '') {
            return null;
        }

        // Procedure record: the first eight ".value" elements, keyed arch_1..arch_8.
        $archives = [];
        for ($m = 0; $m < 8; $m++) {
            $archives['arch_' . ($m + 1)] = pq(".value:eq($m)")->text();
        }

        $row = [
            'title'    => $title,
            'alias'    => pq(".alias")->text(),
            'abstract' => pq(".desc:first")->text(),
            'features' => pq(".p1:first")->text(),
            'efficacy' => pq(".labels:first")->text(),
            'feature'  => pq(".labels:last")->text(),
            'suitable' => pq("#crowd > p")->text(),
            'virtue'   => pq("#merit > p")->text(),
            'defect'   => pq("#defect > p")->text(),
            'taboo'    => pq("#limit_crowd > p")->text(),
            'archives' => json_encode($archives),
            'nursing'  => $this->getNursing(),
        ];

        // getNursing() yields the literal "[]" when the page has no care section.
        if ($row['nursing'] === '[]') {
            unset($row['nursing']);
        }

        // Leave out columns for which the page provided no text.
        return array_filter($row, function ($value) {
            return $value !== '' && $value !== null;
        });
    }

    /**
     * Hard-coded list of item slugs; each non-empty entry is appended to
     * ITEM_BASE_URL to form a page URL. The list contains empty strings and
     * repeated entries.
     *
     * @return string[]
     */
    private function itemSlugs()
    {
        return [
            "zhibo","","meiliriji","anxingou","quanziiclassquani","xinyang","","congyezherenzheng",
            "jigouruzhu","lianxiwomen","xinyang","","","","dalei","erjilei","sanjilei",
            "boniaosuanlongbi","boniaosuanquanliantianchong","boniaosuanfengetou","boniaosuandianxiaba",
            "boniaosuanfengpingguoji","boniaosuanfengtaiyangxue","boniaosuanfengchun","boniaosuanfengwocan",
            "boniaosuanfengmeigong","boniaosuanfengmianjia","boniaosuanfengchunzhu","boniaosuanfengbijidi",
            "boniaosuanfengleigou","boniaosuanfengerchui","boniaosuanfengyanwo","boniaosuanqujingwen",
            "boniaosuanqujingtaiwen","boniaosuanqufalingwen","boniaosuanqumianbuxiwen","boniaosuanquheiyanquan",
            "rongjiemei","roudusuquyuweiwen","roudusuqutaitouwen","roudusuquchuanziwen","roudusuqumuouwen",
            "roudusuqudongtaiwen","roudusuqufalingwen","roudusuqumianbuxiwen","roudusuqubibeiwen",
            "roudusuqukouzhouwen","roudusushoulian","roudusushouxiaotui","roudusushoujianji","roudususuobitou",
            "roudusuquhuchou","roudusuzhiliaoduohan","roudusuquluxiao","roudusujifangsong","feiluojia",
            "meisumeifu","remaji","chaoshengtisheng","shepinjinfu","relati","shaonvzhen","tongyanzhen",
            "pimiaojiguang","tiqujingwen","ganxibaoliaofa","qingchunjiemayimeifu","meibaizhushe","heilianwawa",
            "caiguangnenfu","leishejingfu","meibaidaoru","boniaosuandaoru","baiciwawa","jiguangban",
            "pimiaojiguang","leishejingfu","jiguangdian","guangban","10ban","22wangzhezhiguan",
            "honglanguangdou","chaoweixiaoqipao","xingrensuanhuanfu","guosuanhuanfu","qingdouzhen",
            "shuiguangbushui","shuiyanghuofu","boniaosuandaoru","zhongzhimianmo","wuzhenshuiguang",
            "shuiguangwawa","huangjinhuanfu","weizhenmeisu","shepinweizhen","caiguangnenfu","guangnenfu",
            "guangzinenfu","dianzhenjiguang","shepinweizhen","weizhenmeisu","xiangsujiguang","huaxuehuanfu",
            "yifuquanweizhen","huangjinweizhen","zhusheba","shoushuba","jiguangba","wuchuangbahenxiufu",
            "jiguangquwenshen","qurenshenwen","ximei","xiyanxian","xichunxian","jiguanghongxuesi",
            "anmojinzhi","jingluojingpai","jinghuadaoru","shepinmeifu","shenlanshepin",
            "relati","kangminxiufu","kaineiyanjiao","kaiwaiyanjiao","maixianshuangyanpi","qiekaishuangyanpi",
            "dingdianshuangyanpi","neiqiequyandai","waiqiequyandai","kuanggezhifangshifang","jiguangquyandai",
            "jiguangquheiyanquan","boniaosuanquheiyanquan","xiongmaozhenquheiyanquan","boniaosuanfengwocan",
            "zitizhifangfengwocan","qiemeishu","shangyanquzhi","shangtiji","boniaosuanfengyanwo",
            "xiayanxiazhishu","boniaosuanfengmeigong","jiatidianmeigong","zitizhifangfengmeigong",
            "yanbuduoxiang","yanbuxiufu","pengtilongbi","guijiaolongbi","erruangufuhelongbi",
            "boniaosuanlongbi","maixianlongbi","zitizhifanglongbi","bizhonggeruangulongbi","leiruangulongbi",
            "zitizhenpilongbi","jiaoyuandanbailongbi","mantebolongbi","chaotilongbi","zitiruangudianbijian",
            "shoushusuobitou","roudususuobitou","shoushusuobiyi","yinggoubijiaozheng","kuanbijiaozheng",
            "tuofengbijiaozheng","changbijiaozheng","waibijiaozheng","shoushuyanchangbixiaozhu","bibuduoxiang",
            "boniaosuanfengbijidi","zitizhifangfengbijidi","zitiruangudianbijidi","jiatidianbijidi","bibuxiufu",
            "bibuguijiaojiatiquchu","bibupengtijiatiquchu","bikongjiaozheng","zitizhifangfengetou",
            "jiatifengetou","etousuoxiao","boniaosuanfengetou","zitizhifangfengtaiyangxue",
            "jiatifengtaiyangxue","boniaosuanfengtaiyangxue","quanguneitui","quangongjiangdi",
            "quanguquangongzhengxingshu","shangxiashoushutuzuijiaozheng","shangqiantutianbaodi",
            "xiaqiantudibaotian","mianbubuduichengjiaozheng","xiabajiegushu","guijiaodianxiaba",
            "pengtidianxiaba","boniaosuandianxiaba","zitizhifangdianxiaba","xiabaxizhishuqushuangxiaba",
            "mantebolongxiaba","qujiazhidianshoulian","rongzhishoulian","roudusushoulian","mianbuxizhi",
            "shepinrongzhishoulian","guangxianrongzhishoulian","jiguangrongzhishoulian","guazilianshoushu",
            "xiajiaozhengxing","jiuwochengxingshu","shoushuqufalingwen","boniaosuanqufalingwen",
            "roudusuqufalingwen","boniaosuanfengpingguoji","zitizhifangfengpingguoji",
            "jiaoyuandanbaitianchongpingguoji","zitizhifangfengmianjia","boniaosuanfengmianjia",
            "maixiantisheng","chuzhoushu","boniaosuanfengmeigong","zitizhifangfengmeigong",
            "jiatidianmeigong","renzhongsuoduanshu","xiabaguijiaojiatiquchu","xiabapengtijiatiquchu",
            "zitizhifangfengpingguoji","zitizhifangfengmianjia","zitizhifangfengtaiyangxue",
            "zitizhifangfengetou","zitizhifangfengleigou","zitizhifanglongbi","zitizhifangdianxiaba",
            "zitizhifangfengbijidi","zitizhifangfengmeigong","zitizhifangfengyanwo",
            "zitizhifangquanliantianchong","zitizhifangfengwocan","zitizhifangqufalingwen",
            "zitizhifangfengchun","zitizhifangfengerchui","zitizhifangfengtun","zitizhifanglongxiong",
            "zhifangtianchongshibaixiufu","jiatilongxiong","zitizhifanglongxiong","fuhelongxiong",
            "rutouneixianjiaozheng","rutousuoxiao","xizhiqufuru","shoushuqufuru","ruyunpiaohong",
            "ruyunsuoxiao","rufangxiachuijiaozheng","nvxingrufangsuoxiao","nanxingrufangfeidajiaozheng",
            "longxiongxiufu","xiongbujiatiquchu","shepinrongzhishoushoubi","shepinrongzhishouyaobu",
            "shepinrongzhishoufubu","shepinrongzhishoutunbu","shepinrongzhishoudatui",
            "shepinrongzhishouxiaotui","shepinrongzhishoubeibu","shepinrongzhishoujianbang",
            "lengdongrongzhishouyaobu","lengdongrongzhishoufubu","lengdongrongzhishoutunbu",
            "lengdongrongzhishoudatui","lengdongrongzhishouxiaotui","lengdongrongzhishoujianbang",
            "lengdongrongzhishoushoubi","lengdongrongzhishoubeibu","xizhishouyaobu","xizhishoufubu",
            "xizhishoudatui","xizhishouxiaotui","xizhishoutunbu","xizhishoushoubi","xizhishoujianbang",
            "xizhiqufuru","xizhishibaixiufu","xizhishouquanshen","xizhishouyaofu","xizhishoubeibu",
            "xizhishoushuangtui","gongzhenxizhishu","junengzhenboxizhi","jiatifengtun","zitizhifangfengtun",
            "xiaotuishenjingzuduanshu","fubichengxingshu","chaoshengrongzhishoushoubi",
            "chaoshengrongzhishouyaobu","chaoshengrongzhishoufubu","chaoshengrongzhishoudatui",
            "chaoshengrongzhishouxiaotui","chaoshengrongzhishoutunbu","chaoshengrongzhishoubeibu",
            "chaoshengrongzhishoujianbu","youlisu","relisu","gekongrongzhishoushoubi",
            "gekongrongzhishouyaobu","gekongrongzhishoufubu","gekongrongzhishoudatui",
            "gekongrongzhishouxiaotui","gekongrongzhishoubeibu","gekongrongzhishoutunbu",
            "gekongrongzhishoujianbu","dongyangshoushoubi","dongyangshouyaobu","dongyangshoufubu",
            "dongyangshoudatui","dongyangshouxiaotui","dongyangshoubeibu","dongyangshoujianbang",
            "dongyangshoutunbu","anmojianfei","maixianjianfei","maixiantisheng","chaoshengtisheng","remaji",
            "relati","shepinjinfu","lapishoushutisheng","neikuijingshoushutisheng","xiaoqiekoushoushutisheng",
            "meitisu","tiqujingwen","shaonvzhen","tongyanzhen","feiluojia","fuyanzhen","lishazitikangshuai",
            "boniaosuanqujingtaiwen","roudusuqudongtaiwen","penshaxiya","chaoshengboxiya","yachipaoguang",
            "lengguangmeibai","chimeibai","hejinkaociya","quanciya","guijinshukaociya","eryanghuaquanciya",
            "jinshutuocaojiaozheng","yinxingjiaozheng","yinxingtuocaojiaozheng","shecejiaozheng",
            "zisuotuocaojiaozheng","mianzhengji","shuzhitiemian","citiemian","zhongzhiya","baya",
            "genguanzhiliao","buya","yazhouhuli","bazhichi","xianweijinggenguanzhiliao","yachizonghe","yayan",
            "kouqiangkuiyang","yasuiyan","mianzhengji","banyongjiuwenmei","banyongjiuwenchun",
            "banyongjiuwenfajixian","banyongjiumeitongxian","banyongjiujiemaoxian","banyongjiuwenyanxian",
            "yunjieshu","jiejiemao","tuoshoubi","tuodaxiaotui","tuoyemao","tuochunmao","tuofajixian",
            "bijinixiantuomao","simituomao","tuoshoujiaomao","tuobeibu","tuoluosaihu","zhongzhifajixian",
            "toudingjiamizhongzhi","meirenjianzhongzhi","bahenzhongzhi","zhongzhimeimao","zhongzhijiemao",
            "zhongzhijiao","zhongzhihuxu","zhongzhixiongmao","zhongzhisimimaofa","shoushujinsuoyindao",
            "jiguangjinsuoyindao","chunvmoxiufu","xiaoyinchunshoushu","yindizhengxingshu","simipiaohong",
            "shuqianjiancha","simiqingjie","dianzhushe","simichaoshengtisheng","simihuli","qiebaopi",
            "yinjingzengda","houchungaibao","chunshoushu","zuijiaoshangyangshu","shoushuquluxiao",
            "roudusuquluxiao","chunbuxiufu","chunlieshoushu","boniaosuanfengchun","boniaosuanfengchunzhu",
            "zitizhifangfengchun","renzhongsuoduanshu","chunbuzonghe","zhaofengerjiaozheng",
            "boniaosuanfengerchui","zitizhifangfengerchui","erzaizao",
            "yanbuxiufu","bibuguijiaojiatiquchu","bibupengtijiatiquchu","bibuxiufu","chunbuxiufu",
            "chunlieshoushu","chunvmoxiufu","xiongbujiatiquchu","longxiongxiufu","xizhishibaixiufu",
            "zhifangtianchongshibaixiufu","zhushewuquchu","rongjiemei","shoushuchuyechou","jiguangchuyechou",
            "jiaoyuandanbai","aibeizhushe","dajiaogujiaozheng","gongjingaiyimiao","baguanjianfei",
            "zhenjiujianfei","bianxingshoushu","zonghexiangmu","sirendingzhi","jiguangchuyechou","zhenggushu",
            "quanfeimiaojiguangshu","feimiaojiguangshoushu","jingtizhirushoushu","hougongmojiagushu",
            "lvfeimiaojiguangshu","zhunfenzijiguangshu","quanjiguangshoushu","yuanshijiaozheng",
            "xieshijiaozheng","ruoshijiaozheng","qingguangyan","jiemoyan","sanguang","jiaomoyan",
            "shiwangmotuoluo","shayan","shilijiancha","beiyunjiancha","tongjingjiancha","yindaoyanjiancha",
            "fuketijian","yuejingbudiao","chanhousuxing","chanhouxunzheng","pendijixiufu","yindaoxiufu",
            "chanhourufanghuli","ruxianshutong","zigongjiliu","luanchaonangzhong","gongjingyan","penqiangyan",
            "nvxingtijian","jiyinjiance","gongjingaishaicha","ruxianaishaicha","changguitijian",
            "zhonglaoniantijian","jingluoshutong","aijiuliliao","guabaguan","zhongyiyaoyu","zhongyianmo",
            "fangtuoliliao","wufaliliao","shencengqingjie","yangfahuli","toubu","toupihuli","quanshen",
            "beibuhuli","meixionghuli","jianjinghuli","tuibuhuli","tunbuhuli","shoubuhuli","zuliao",
            "yanbuhuli","","zhengguijigou","","jiagetouming","","dingjinshandiantui","","zhenshidianping","",
            "zhongguozhengxingmeirongxiehuihuiyuandanwei","",
            "beijingyishixiehuizhengxingwaikezhuanjiahuizhenzhongxin","guanzhuxinyanggongzhonghao",
            "xiazaixinyang","","","","","","","","","",
            "chakanxiangqing","chakanxiangqing","chakanxiangqing","chakanxiangqing",
        ];
    }

}


phpQuery 类似于 jQuery,基于 DOM 操作,非常容易上手,对照文档就可以使用:


phpQuery 文档 入口



随便看看
QQ在线咨询
电话:
182 7047 6708
交流微信:
su18270476708