今天看啥  ›  专栏  ›  猫哥的技术博客

PHP爬虫 -- 018 作业源码(豆瓣top250)

猫哥的技术博客  · 掘金  ·  · 2019-07-18 18:02
阅读 4

PHP爬虫 -- 018 作业源码(豆瓣top250)

首先, 创建数据表

-- One row per movie scraped from the Douban Top 250 list pages.
CREATE TABLE `douban_movie` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(255) NOT NULL,
  -- Value of the poster <img> src attribute.
  `img` varchar(255) NOT NULL,
  -- Raw text of the first paragraph in the entry's description block.
  `info` varchar(255) NOT NULL,
  `rank` int(11) NOT NULL,
  -- decimal(3,1) so that a perfect 10.0 fits; decimal(2,1) tops out at 9.9.
  `score` decimal(3,1) NOT NULL,
  -- One-line quote shown under the entry, or the crawler's placeholder text.
  `comment` varchar(255) NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
复制代码

编写爬虫代码

<?php
require 'vendor/autoload.php';
use QL\QueryList;
use Medoo\Medoo;
// Connection settings for the MySQL database that receives the scraped rows.
$db_config = [
    'database_type' => 'mysql',
    'database_name' => 'demo_db',
    'server' => 'localhost',
    'username' => 'root',
    'password' => 'root',
    'charset' => 'utf8',
];
// Open the connection through Medoo; the functions below reach it via `global $database`.
$database = new Medoo($db_config);



/**
 * Extract the movie entries from one list page using CSS selectors.
 *
 * @param string $url Target page URL.
 * @return array Two-dimensional array, one element per movie, with the keys
 *               title, img, info, rank, score and comment.
 */
function get_data($url) {
    echo "function get_data is running ... \n";
    $movie_html = get_html_source($url);

    // Parse the page once and reuse the document for both the rule-based
    // extraction and the per-entry quote lookups (the HTML used to be
    // re-parsed on every loop iteration).
    $doc = (new QueryList())->html($movie_html);

    // Selector prefix that matches every entry of the ranked list.
    $item = '#content > div > div.article > ol > li';

    $data = $doc->rules([
        "title" => ["{$item} > div > div.info > div.hd > a > span:nth-child(1)", 'text'],
        "img" => ["{$item} > div > div.pic > a > img", 'src'],
        "info" => ["{$item} > div > div.info > div.bd > p:nth-child(1)", 'text'],
        "rank" => ["{$item} > div > div.pic > em", 'text'],
        "score" => ["{$item} > div > div.info > div.bd > div > span.rating_num", 'text'],
    ])->queryData();

    foreach ($data as $key => $value) {
        // :nth-child() counts from 1; assumes queryData() keys are 0-based and in page order.
        $num = $key + 1;
        $res = $doc->find("{$item}:nth-child({$num}) > div > div.info > div.bd > p.quote > span")->text();
        // The quote is looked up per entry because it may be missing; use a placeholder then.
        $data[$key]['comment'] = $res ? $res : "好看到无话可说...";
    }
    return $data;
}



/**
 * Save scraped movies to the douban_movie table with a single batch insert.
 *
 * @param array $data Two-dimensional array, one element per row to insert.
 * @return void
 */
function save_movie($data){
    echo "function save_movie is running ... \n";
    // Nothing to store: skip rather than hand the database layer an empty batch.
    if (empty($data)) {
        return;
    }
    global $database;
    $database->insert('douban_movie',$data);
}

/**
 * Fetch the HTML of a page through an authenticated HTTP proxy, retrying on failure.
 *
 * @param string $url          Target URL.
 * @param int    $max_attempts Maximum number of attempts. 0 (the default) keeps
 *                             retrying until a non-empty response arrives.
 * @return string HTML source of the page.
 * @throws RuntimeException When $max_attempts is positive and every attempt failed.
 */
function get_html_source($url, $max_attempts = 0) {
    echo "function get_html_source is running ... \n";

    // Proxy settings do not change between attempts, so set them up once.
    // NOTE(review): credentials are hard-coded in source; consider loading them from configuration.
    $proxyServer = "http://http-dyn.abuyun.com:9020";
    $proxyUser = "H19D75L76VK89Q8D";
    $proxyPass = "8C17B0A80F475BD8";

    $attempt = 0;
    while (true) {
        $attempt++;
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HTTPPROXYTUNNEL, false);
        // NOTE(review): peer certificate verification is disabled here.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_PROXYTYPE, CURLPROXY_HTTP);
        curl_setopt($ch, CURLOPT_PROXY, $proxyServer);
        curl_setopt($ch, CURLOPT_PROXYAUTH, CURLAUTH_BASIC);
        curl_setopt($ch, CURLOPT_PROXYUSERPWD, "{$proxyUser}:{$proxyPass}");
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727;)");
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 3);
        curl_setopt($ch, CURLOPT_TIMEOUT, 5);
        // Return the body as a string instead of printing it.
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        $result = curl_exec($ch);
        $error = curl_error($ch);
        // Release the handle before any back-off sleep.
        curl_close($ch);

        // A false or empty body counts as a failed attempt.
        if ($result) {
            return $result;
        }

        $reason = $error !== '' ? $error : 'empty response';
        error_log("get_html_source: attempt {$attempt} for {$url} failed: {$reason}");

        if ($max_attempts > 0 && $attempt >= $max_attempts) {
            throw new RuntimeException("Failed to fetch {$url} after {$attempt} attempts: {$reason}");
        }
        // Short pause before the next attempt.
        sleep(2);
    }
}


// Prefix shared by all list pages; only the trailing start offset differs.
$base_url = "https://movie.douban.com/top250?start=";
// Offsets 0, 25, ..., 225 produce the 10 page URLs.
foreach (range(0, 225, 25) as $i) {
    $current_url = "{$base_url}{$i}";
    echo "{$current_url}\n";
    // Scrape one page, then store its rows.
    $data = get_data($current_url);
    save_movie($data);
}
复制代码

看一下最终数据库的效果

从数据库中读取数据, 写入markdown文件

<?php
// Forward slash keeps the path valid on every OS; a backslash separator only resolves on Windows.
require 'vendor/autoload.php';
use Medoo\Medoo;
// Open the database connection; the functions below reach it via `global $database`.
$database = new Medoo([
    'database_type' => 'mysql',
    'database_name' => 'demo_db',
    'server' => 'localhost',
    'username' => 'root',
    'password' => 'root',
    'charset' => 'utf8',
]);
/**
 * Load every stored movie from the douban_movie table, ordered by rank.
 *
 * @return array Two-dimensional array, one element per table row.
 */
function get_data(){
    global $database;
    // Order explicitly: without ORDER BY the row order is left to the storage engine,
    // and the generated report is meant to follow the ranking.
    return $database->select('douban_movie', '*', ['ORDER' => ['rank' => 'ASC']]);
}
/**
 * Render the movie rows as Markdown into doubanmd.md in the current directory.
 *
 * @param array $data Two-dimensional array of movie rows with the keys
 *                    img, rank, title, score, info and comment.
 * @return void
 * @throws RuntimeException When the output file cannot be opened.
 */
function make_markdown($data){
    $md_obj = fopen('doubanmd.md','w+');
    if ($md_obj === false) {
        throw new RuntimeException("Unable to open doubanmd.md for writing");
    }
    foreach ($data as $value) {
        $img = $value['img'];
        $rank = $value['rank'];
        $title = $value['title'];
        $score = $value['score'];
        $comment = $value['comment'];

        // Runs of 3+ whitespace characters separate the lines of the info text.
        // Trimming and dropping empty pieces keeps leading/trailing padding from
        // shifting the pieces; a missing piece is written as an empty line.
        $info = preg_split("/\s{3,}/", trim($value['info']), -1, PREG_SPLIT_NO_EMPTY);
        $info_first = isset($info[0]) ? $info[0] : '';
        $info_second = isset($info[1]) ? $info[1] : '';

        fwrite($md_obj,"![]({$img})\n");
        fwrite($md_obj,"## {$rank}-{$title}-{$score}\n");
        fwrite($md_obj,"```\n");
        fwrite($md_obj,"{$info_first}\n");
        fwrite($md_obj,"{$info_second}\n");
        fwrite($md_obj,"```\n");
        fwrite($md_obj,"> ### {$comment}\n");
        fwrite($md_obj,"---\n\n\n");
    }
    fclose($md_obj);
}


// Load the stored movies, then render them into the Markdown report.
$movies = get_data();
make_markdown($movies);
复制代码

最终效果




原文地址:访问原文地址
快照地址: 访问文章快照