采集流程
根据链接获取页面内容(curl)->获取需要采集的内容(可以通过正则、xpath、css选择器等方法进行筛选)
<?php require_once 'phpspider/autoloader.php';
use phpspider\core\phpspider;
use phpspider\core\requests;
use phpspider\core\selector;
/* Do NOT delete this comment */
/* 不要删除这段注释 */
// Encoding settings for phpspider's HTTP client; input and output are set to
// the same value here.
// NOTE(review): confirm GB2312 is what the target pages actually use.
requests::$input_encoding = 'GB2312';
requests::$output_encoding = 'GB2312';

// Step 1: collect the article URLs from list pages 1..10 of the blog.
//
// $result holds one entry per list page (indexes 0..9). Every entry is an
// array of URLs, possibly empty, so the page index stays stable and each
// entry can be iterated safely afterwards.
$result = [];

// Matches the title anchor of each post on a list page and captures its href.
// - \d+ accepts control indexes of any width.
// - [^"]+ stops the capture at the closing quote of the href attribute, so it
//   cannot run on into other attributes or anchors on the same line.
$selector = '/<a\sid="homepage1_HomePageDays_DaysList_ctl\d+_DayList_TitleUrl_\d+"\sclass="postTitle2"\shref="([^"]+)"/';

for ($i = 1; $i <= 10; $i++) {
    $url = "https://www.cnblogs.com/jcydd/default.html?page=" . $i;
    $html = requests::get($url);

    // Nothing usable came back: record an empty page and move on.
    if (empty($html)) {
        $result[] = [];
        continue;
    }

    // Cast to array because a regex select is not guaranteed to return a list
    // (presumably null when nothing matches and a bare string for a single
    // match; verify against phpspider's selector implementation).
    $result[] = (array) selector::select($html, $selector, 'regex');
}
// Step 2: visit every collected article URL, extract its title and body, and
// save them as plain text in "f<page index><link index>.txt".

// "@" is the PCRE delimiter for these patterns because they contain "/"
// (in "</a>"); any delimiter that does not occur in the pattern would work.
// The "s" modifier lets "." match newlines as well.

// Title: text inside the anchor with id "cb_post_title_url". The capture is
// lazy so it ends at the first closing </a>.
$selector1 = '@<a\sid="cb_post_title_url"\s[^>]*>(.*?)</a>@s';

// Body: everything between the opening "cnblogs_post_body" div and the
// "MySignature" div. A single capture group keeps the select result down to
// the body text only, and "(.*)" with the "s" modifier avoids per-character
// alternation, which can run into PCRE limits on long posts. Any attributes
// after the id are accepted.
// NOTE(review): presumably posts whose body div carries a different class
// list were being missed before; confirm against real pages.
$selector2 = '@<div\sid="cnblogs_post_body"[^>]*>(.*)<div\sid="MySignature">@s';

// Removes anything that looks like an HTML tag: "<", then everything up to
// the next ">".
$tagPattern = '/<[^>]*>/';

foreach ($result as $k => $v) {
    // A page entry may not be an array (e.g. null or a single URL string).
    foreach ((array) $v as $kk => $vv) {
        $html1 = requests::get($vv);
        if (empty($html1)) {
            // Do not create a file for an article that could not be fetched.
            error_log("Skipping article, empty response: " . $vv);
            continue;
        }

        // The select result is normalised to an array and the first match is
        // used as the title; tags inside the title are removed and the
        // surrounding whitespace is trimmed.
        $titles = (array) selector::select($html1, $selector1, 'regex');
        $title = trim((string) preg_replace($tagPattern, '', (string) reset($titles)));

        // Strip tags from every matched body fragment and join them.
        $bodies = (array) selector::select($html1, $selector2, 'regex');
        $text = implode('', (array) preg_replace($tagPattern, '', $bodies));

        // One file per article: the title, a CRLF line break, then the body.
        $file = "f" . $k . $kk . ".txt";
        if (file_put_contents($file, $title . "\r\n" . $text) === false) {
            die("Unable to open file!");
        }
    }
}