第一步:
创建项目
scrapy startproject [name]
如 scrapy startproject maoyan
第二步:
进入项目的文件夹目录,创建爬虫(spider)
scrapy genspider movie maoyan.com
第三步:
配置movie.py文件
import scrapy

from maoyan.items import MaoyanItem


class MovieSpider(scrapy.Spider):
    """Spider that scrapes movie entries starting from ``start_urls``.

    Each ``<dd>`` element on a page is turned into one ``MaoyanItem``; the
    spider then follows the link whose text is "下一页" ("next page").
    """

    name = 'movie'
    allowed_domains = ['maoyan.com']
    start_urls = ['https://maoyan.com/board/4']

    def parse(self, response):
        """Yield one item per ``<dd>`` node, then a request for the next page.

        Args:
            response: The downloaded page handed over by Scrapy.

        Yields:
            ``MaoyanItem`` objects with ``title``, ``actor`` and ``time``
            set, followed by a ``scrapy.Request`` for the next page when a
            next-page link is present.
        """
        for movie_item in response.xpath('//dd'):
            item = MaoyanItem()
            item['title'] = movie_item.xpath('.//p/a/@title').extract_first()

            # extract_first() returns None when the node is missing, so only
            # strip when there is actually text to strip.
            actor = movie_item.xpath(
                './/p[@class="star"]/text()').extract_first()
            item['actor'] = actor.strip() if actor is not None else None

            item['time'] = movie_item.xpath(
                './/p[@class="releasetime"]/text()').extract_first()
            yield item

        # Pagination is handled once per page, after all items are emitted.
        next_url = response.xpath('//a[text()="下一页"]/@href').extract_first()
        if next_url:
            # Without this guard a missing link would build a request from
            # urljoin(None) instead of simply ending the crawl.
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse)
第四步:
配置items.py文件
import scrapy


class MaoyanItem(scrapy.Item):
    """Item holding the three fields collected for a single movie entry."""

    # Declared one per line so new fields can be added with a one-line diff.
    title = scrapy.Field()
    actor = scrapy.Field()
    time = scrapy.Field()
第五步:
配置pipelines.py文件
import pymongo


class MongoPipeline(object):
    """Item pipeline that writes every scraped item into MongoDB.

    The connection URI and database name are read from the crawler settings
    ``MONGO_URI`` and ``MONGO_DB``.
    """

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection parameters; no connection is opened yet.

        Args:
            mongo_uri: MongoDB connection URI.
            mongo_db: Name of the database to write to.
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's settings."""
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        """Open the MongoDB client and select the configured database."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        """Insert the item into a collection named after its class.

        Args:
            item: The scraped item.
            spider: The spider that produced the item (unused).

        Returns:
            The same item, so later pipelines can keep processing it.
        """
        collection_name = item.__class__.__name__
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        self.db[collection_name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB client opened in ``open_spider``."""
        self.client.close()
第六步:
配置settings.py文件
# -*- coding: utf-8 -*-

# Scrapy settings for maoyan project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

import os

BOT_NAME = 'maoyan'

SPIDER_MODULES = ['maoyan.spiders']
NEWSPIDER_MODULE = 'maoyan.spiders'

# Requests are sent with a desktop Chrome User-Agent string.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'

# robots.txt rules are NOT obeyed by this project.
ROBOTSTXT_OBEY = False

# Enabled item pipelines and their order value.
ITEM_PIPELINES = {
    'maoyan.pipelines.MongoPipeline': 400,
}

# MongoDB connection used by MongoPipeline. The URI embeds credentials, so it
# can be overridden through the MONGO_URI environment variable instead of
# editing this file; the original value is kept as the default.
MONGO_URI = os.environ.get('MONGO_URI', 'mongodb://admin:123456@localhost/')
MONGO_DB = 'maoyan'
第七步:
运行项目
scrapy crawl movie