1. 创建一个 Scrapy 项目,在命令行或者 Pycharm 的 Terminal 中输入:
scrapy startproject imagepix
自动生成了下列文件:
2. 在 imagepix/spiders 文件夹下新建一个 imagepixiv.py 文件
3. imagepixiv.py 下的代码:
import scrapy
from urllib.parse import urlencode
import json

from ..items import ImagepixItem


class ImagepixivSpider(scrapy.Spider):
    """Spider that queries the pixivic illustration search API page by page
    and yields one ImagepixItem per illustration found."""

    name = 'imagepixiv'

    def start_requests(self):
        """Yield one API request per result page, from 1 up to MAX_PAGE.

        MAX_PAGE is read from the project settings so the crawl depth can
        be tuned without touching the spider.
        """
        data = {'keyword': '风景'}
        base_url = 'https://api.pixivic.com/illustrations?'
        for page in range(1, self.settings.get('MAX_PAGE') + 1):
            data['page'] = page
            yield scrapy.Request(base_url + urlencode(data), callback=self.parse)

    def parse(self, response):
        """Parse the JSON API response and yield items for the image pipeline.

        Guards against malformed responses: a missing/null 'data' key and
        entries without any 'imageUrls' are skipped instead of raising
        TypeError / IndexError (the original crashed on both).
        """
        result = json.loads(response.text)
        for image in result.get('data') or []:
            urls = image.get('imageUrls') or []
            if not urls:
                # No downloadable image for this entry — skip it.
                continue
            item = ImagepixItem()
            item['title'] = image.get('title')
            item['id'] = image.get('id')
            # Route through the pixivic proxy so the image host accepts the request.
            item['url'] = 'https://img.pixivic.com:23334/get/' + str(urls[0].get('large'))
            yield item
4. items.py 下的代码:
import scrapy
from scrapy import Field


class ImagepixItem(scrapy.Item):
    """Container for one scraped illustration."""

    title = scrapy.Field()  # illustration title as returned by the API
    id = scrapy.Field()     # illustration id as returned by the API
    url = scrapy.Field()    # proxied image URL consumed by the images pipeline
5. pipelines.py 下的代码:
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class ImagepixPipeline(ImagesPipeline):
    """Images pipeline that downloads each item's 'url' and stores the file
    under its original name (the last segment of the URL path)."""

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for a downloaded image.

        `item=None` is accepted as a keyword so the override stays compatible
        with Scrapy >= 2.4, which passes the item to file_path; older Scrapy
        versions that do not pass it still work via the default.
        """
        return request.url.split('/')[-1]

    def item_completed(self, results, item, info):
        """Drop the item if its image failed to download; pass it on otherwise."""
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Downloaded Failed')
        return item

    def get_media_requests(self, item, info):
        # Schedule the single image URL carried by the item for download.
        yield Request(item['url'])
6. settings.py 下的代码:
# --- Scrapy project identity ---------------------------------------------
BOT_NAME = 'imagepix'
SPIDER_MODULES = ['imagepix.spiders']
NEWSPIDER_MODULE = 'imagepix.spiders'

# --- Crawl scope ----------------------------------------------------------
# Number of API result pages the spider will request (read in start_requests).
MAX_PAGE = 50

# --- Export & storage -----------------------------------------------------
FEED_EXPORT_ENCODING = 'utf-8'  # keep Chinese text readable in feed exports
IMAGES_STORE = './images'       # directory the images pipeline writes into

# --- Pipelines ------------------------------------------------------------
ITEM_PIPELINES = {
    'imagepix.pipelines.ImagepixPipeline': 300,
}

# The target API is queried directly; robots.txt checking is disabled.
ROBOTSTXT_OBEY = False
7. 在命令行运行:
scrapy crawl imagepixiv
8. 结果: