Why use multiple pipelines in Scrapy:
A single project may need to crawl several websites, and because each site's data volume (and the way its data has to be processed) differs, you can create a separate pipeline for each.
class SpideranythingPipeline(object):
    def process_item(self, item, spider):
        # spider is the spider instance; 'itcast' is its name, which is how
        # items coming from different spiders are told apart
        if spider.name == 'itcast':
            print(item)
        return item
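For any of these pipelines to run, they also have to be enabled in settings.py; the number attached to each one is a priority, and lower values run earlier. A minimal sketch, assuming the project module is called spideranything and uses the pipeline classes from this post (adjust the dotted paths to your own layout):

# settings.py -- enable each pipeline and give it a priority
ITEM_PIPELINES = {
    'spideranything.pipelines.SpideranythingPipeline': 300,   # lower number = runs first
    'spideranything.pipelines.SpiderSuningBookPipeline': 400,
    'spideranything.pipelines.PracticePipeline': 500,
}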
Pipeline methods
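A pipeline is just a plain class; Scrapy looks up three methods on it by name, and no base class is required. A bare skeleton for orientation (ExamplePipeline is a made-up name used only for illustration):

class ExamplePipeline(object):
    def open_spider(self, spider):
        # called once when the spider starts: open connections, files, etc.
        pass

    def process_item(self, item, spider):
        # called for every item; return the item so later pipelines still receive it
        return item

    def close_spider(self, spider):
        # called once when the spider finishes: release resources here
        pass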
MySQL
import pymysql


class SpiderSuningBookPipeline(object):
    def process_item(self, item, spider):
        # collection.insert(dict(item))
        sql = """
        insert into book(title,author,download_text,new) values('%s','%s','%s','%s')""" \
            % (
                item['title'],
                item['author'],
                item['download_text'],
                item['new']
            )
        print(sql)
        self.cursor.execute(sql)
        return item

    def open_spider(self, spider):
        # connect to the database when the spider starts
        self.connect = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            db='study',
            user='root',
            passwd='',
            charset='utf8',
            use_unicode=True)
        # inserts, deletes, queries and updates all go through the cursor
        self.cursor = self.connect.cursor()
        self.connect.autocommit(True)

    def close_spider(self, spider):
        # close the cursor and the connection when the spider finishes
        self.cursor.close()
        self.connect.close()
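Building the SQL with % interpolation like this breaks as soon as a field contains a quote character and leaves the insert open to SQL injection; pymysql can escape the values itself when they are passed separately. A sketch of the same insert as a parameterized query, assuming the same book table and item fields, as a drop-in replacement for the process_item above:

    def process_item(self, item, spider):
        # let pymysql quote and escape the values instead of formatting them into the string
        sql = "insert into book(title, author, download_text, new) values (%s, %s, %s, %s)"
        self.cursor.execute(sql, (
            item['title'],
            item['author'],
            item['download_text'],
            item['new'],
        ))
        return item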
MongoDB
from pymongo import MongoClient


class PracticePipeline(object):
    def process_item(self, item, spider):
        '''receives the data returned by the spider'''
        pass

    def open_spider(self, spider):
        '''called when the spider starts'''
        spider.hello = 'world'  # attributes can be attached to the spider here
        # initialise the database connection
        client = MongoClient()
        spider.collection = client['SpiderAnything']['hr']

    def close_spider(self, spider):
        '''called when the spider closes'''
        pass
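The process_item above is only a stub. To actually persist the items, insert them into the collection that open_spider set up; a minimal sketch assuming the same SpiderAnything database and hr collection, keeping the client on the pipeline itself so it can be closed cleanly:

from pymongo import MongoClient


class PracticePipeline(object):
    def open_spider(self, spider):
        self.client = MongoClient()  # defaults to localhost:27017
        self.collection = self.client['SpiderAnything']['hr']

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))  # Items are not plain dicts, convert first
        return item

    def close_spider(self, spider):
        self.client.close()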