Why use multiple pipelines in Scrapy:
A single project may need to crawl several websites, and because each site's data volume (and the way its data has to be processed) differs, you can create a separate pipeline for each.
class SpideranythingPipeline(object):
    def process_item(self, item, spider):
        # spider is the spider instance; 'itcast' is its name, which is how
        # items coming from different spiders are told apart
        if spider.name == 'itcast':
            print(item)
        return item
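For any of these pipelines to run, they also have to be enabled in settings.py; the number attached to each one is a priority, and lower values run earlier. A minimal sketch, assuming the project module is called spideranything and uses the pipeline classes from this post (adjust the dotted paths to your own layout):

# settings.py -- enable each pipeline and give it a priority
ITEM_PIPELINES = {
    'spideranything.pipelines.SpideranythingPipeline': 300,   # lower number = runs first
    'spideranything.pipelines.SpiderSuningBookPipeline': 400,
    'spideranything.pipelines.PracticePipeline': 500,
}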
Pipeline methods
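A pipeline is just a plain class; Scrapy looks up three methods on it by name, and no base class is required. A bare skeleton for orientation (ExamplePipeline is a made-up name used only for illustration):

class ExamplePipeline(object):
    def open_spider(self, spider):
        # called once when the spider starts: open connections, files, etc.
        pass

    def process_item(self, item, spider):
        # called for every item; return the item so later pipelines still receive it
        return item

    def close_spider(self, spider):
        # called once when the spider finishes: release resources here
        pass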
MySQL
import pymysql


class SpiderSuningBookPipeline(object):
    def process_item(self, item, spider):
        # collection.insert(dict(item))
        sql = """
        insert into book(title,author,download_text,new) values('%s','%s','%s','%s')""" \
            % (
                item['title'],
                item['author'],
                item['download_text'],
                item['new']
            )
        print(sql)
        self.cursor.execute(sql)
        return item

    def open_spider(self, spider):
        # connect to the database when the spider starts
        self.connect = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            db='study',
            user='root',
            passwd='',
            charset='utf8',
            use_unicode=True)
        # inserts, deletes, queries and updates all go through the cursor
        self.cursor = self.connect.cursor()
        self.connect.autocommit(True)

    def close_spider(self, spider):
        # close the cursor and the connection when the spider finishes
        self.cursor.close()
        self.connect.close()
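Building the SQL with % interpolation like this breaks as soon as a field contains a quote character and leaves the insert open to SQL injection; pymysql can escape the values itself when they are passed separately. A sketch of the same insert as a parameterized query, assuming the same book table and item fields, as a drop-in replacement for the process_item above:

    def process_item(self, item, spider):
        # let pymysql quote and escape the values instead of formatting them into the string
        sql = "insert into book(title, author, download_text, new) values (%s, %s, %s, %s)"
        self.cursor.execute(sql, (
            item['title'],
            item['author'],
            item['download_text'],
            item['new'],
        ))
        return item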
MongoDB
from pymongo import MongoClient


class PracticePipeline(object):
    def process_item(self, item, spider):
        '''receives the data returned by the spider'''
        pass

    def open_spider(self, spider):
        '''called when the spider starts'''
        spider.hello = 'world'  # attributes can be attached to the spider here
        # initialise the database connection
        client = MongoClient()
        spider.collection = client['SpiderAnything']['hr']

    def close_spider(self, spider):
        '''called when the spider closes'''
        pass
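The process_item above is only a stub. To actually persist the items, insert them into the collection that open_spider set up; a minimal sketch assuming the same SpiderAnything database and hr collection, keeping the client on the pipeline itself so it can be closed cleanly:

from pymongo import MongoClient


class PracticePipeline(object):
    def open_spider(self, spider):
        self.client = MongoClient()  # defaults to localhost:27017
        self.collection = self.client['SpiderAnything']['hr']

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))  # Items are not plain dicts, convert first
        return item

    def close_spider(self, spider):
        self.client.close()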