1 <?php
2
3 /* 参考文章 https://www.iamle.com/archives/2202.html */
4
5 require_once __DIR__.'/vendor/autoload.php';
6 use GuzzleHttp\Client;
7 use Symfony\Component\DomCrawler\Crawler;
8
9 // $url = 'https://movie.douban.com/subject/25812712/?from=showing';
10 // //下载网页内容
11 // $client = new Client([
12 // 'timeout' => 10,
13 // 'headers' => ['User-Agent' => 'Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)',
14 // ],
15 // ]);
16 // $response = $client->request('GET', $url)->getBody()->getContents();
17 // print_r($response); exit;
18
// Script entry point: run the scraper and print its result as JSON.
// JSON_UNESCAPED_UNICODE keeps non-ASCII text readable instead of \uXXXX escapes.
echo json_encode(Spider(), JSON_UNESCAPED_UNICODE);
20 //print_r(Spider());
21
/**
 * Scrape one hard-coded Douban movie page and return its details.
 *
 * The page is downloaded with Guzzle and the fields are pulled out with XPath
 * expressions through Symfony's DomCrawler. Extraction is best-effort: the
 * first exception raised while extracting stops the extraction, is written to
 * the error log, and whatever was collected up to that point is returned.
 *
 * Side effect: an opening "<pre>" tag is echoed before the request is sent
 * (kept from the original so existing output stays the same).
 *
 * The HTTP request itself is made outside the try block, so any exception it
 * raises propagates to the caller.
 *
 * Keys that may be present in the result (all optional on partial failure):
 *  - name          string    movie title
 *  - cover         string    poster image URL
 *  - director      string[]  director names
 *  - writer        string    text of the first screenwriter link
 *  - mactor        string[]  leading actor names
 *  - rdate         string[]  release dates in the form "YYYY-MM-DD(region)"
 *  - introduction  string    synopsis
 *  - actor         array[]   cast entries: name, role, avatar (URL or null)
 *
 * @return array
 */
function Spider()
{
    // Page to scrape.
    $url = 'https://movie.douban.com/subject/25812712/?from=showing';

    // Download the page content.
    $client = new Client([
        'timeout' => 10,
        'headers' => [
            'User-Agent' => 'Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)',
        ],
    ]);

    // Emit the result in preformatted form when viewed in a browser.
    echo '<pre>';
    $response = $client->request('GET', $url)->getBody()->getContents();

    // Structured result is accumulated here.
    $data = [];
    $crawler = new Crawler();
    $crawler->addHtmlContent($response);

    // Turns "A / B / C" into ['A', 'B', 'C'].
    $splitNames = function ($text) {
        return array_map('trim', explode('/', $text));
    };

    try {
        // Movie title. Elements with an id make for the simplest XPath expressions.
        $data['name'] = $crawler->filterXPath('//*[@id="content"]/h1/span[1]')->text();

        // Poster image URL.
        $data['cover'] = $crawler->filterXPath('//*[@id="mainpic"]/a/img/@src')->text();

        // Directors, split into a list.
        $data['director'] = $splitNames(
            $crawler->filterXPath('//*[@id="info"]/span[1]/span[2]')->text()
        );

        // Screenwriter. Stored under its own key: the original wrote this into
        // 'cover', which discarded the poster URL extracted above.
        $data['writer'] = $crawler->filterXPath('//*[@id="info"]/span[2]/span[2]/a')->text();

        // Leading actors, split into a list.
        $data['mactor'] = $splitNames(
            $crawler->filterXPath('//*[@id="info"]/span[contains(@class,"actor")]/span[contains(@class,"attrs")]')->text()
        );

        // Release dates: taken from the whole info block with a regex, since
        // each date is followed by its region in parentheses.
        $infoText = $crawler->filterXPath('//*[@id="info"]')->text();
        preg_match_all('/(\d{4})-(\d{2})-(\d{2})\(.*?\)/', $infoText, $dates);
        $data['rdate'] = isset($dates[0]) ? $dates[0] : [];

        // Synopsis, selected by class rather than id.
        $data['introduction'] = trim($crawler->filterXPath('//div[contains(@class,"indent")]/span')->text());

        // Cast list: the expression matches several nodes, so each one is
        // visited in turn. $data is captured by reference so the closure can
        // append to it.
        $data['actor'] = [];
        $crawler->filterXPath('//ul[contains(@class,"celebrities-list from-subject")]/li')->each(function (Crawler $node, $i) use (&$data) {
            $actor = [];
            $actor['name'] = $node->filterXPath('//div[contains(@class,"info")]/span[contains(@class,"name")]/a')->text();
            $actor['role'] = $node->filterXPath('//div[contains(@class,"info")]/span[contains(@class,"role")]')->text();

            // The avatar lives in an inline style such as
            // "background-image: url(https://.../5253.jpg)"; pull the URL out of it.
            $style = $node->filterXPath('//a/div[contains(@class,"avatar")]/@style')->text();
            $found = preg_match('/((https|http|ftp|rtsp|mms)?:\/\/)[^\s]+\.(jpg|jpeg|gif|png)/', $style, $match);
            // No image URL in the style attribute: record null instead of
            // reading a missing array offset.
            $actor['avatar'] = ($found === 1) ? $match[0] : null;

            $data['actor'][] = $actor;
        });
    } catch (\Exception $e) {
        // Stay best-effort (return what was collected), but leave a trace of
        // why extraction stopped instead of silently discarding the error.
        error_log('Spider: extraction stopped early: ' . $e->getMessage());
    }

    return $data;
}