I am trying to run Scrapy (a spider/crawl) from a Django project, as a Celery task triggered from the admin interface. Here is my code.
When I try to call the task from the Python shell, I get an error.

djangoproject:

-monapp:

   -tasks.py
   -spider.py
   -myspider.py
   -models.py
         .....

tasks.py:

    from djcelery import celery
    from demoapp.spider import *
    from demoapp.myspider import *

    @celery.task
    def add(x, y):
        return x + y

    @celery.task
    def scra():
        result_queue = Queue()  # Queue comes in via the star import of demoapp.spider
        crawler = CrawlerWorker(MySpider(), result_queue)
        crawler.start()  # Process.start(): the crawl runs in a child process
        return "success"

spider.py:

    from scrapy import project, signals
    from scrapy.settings import Settings
    from scrapy.crawler import Crawler
    from scrapy.xlib.pydispatch import dispatcher
    from multiprocessing import Queue
    import multiprocessing

    class CrawlerWorker(multiprocessing.Process):

        def __init__(self, spider, result_queue):
            multiprocessing.Process.__init__(self)
            self.result_queue = result_queue
            self.crawler = Crawler(Settings())
            # install() belongs inside the if (as posted, the missing
            # indentation raised an IndentationError); install the crawler
            # only once per process
            if not hasattr(project, 'crawler'):
                self.crawler.install()
            self.crawler.configure()

            self.items = []
            self.spider = spider
            dispatcher.connect(self._item_passed, signals.item_passed)

        def _item_passed(self, item):
            self.items.append(item)

        def run(self):
            # runs in the child process: crawl to completion, then hand
            # the collected items back over the queue
            self.crawler.crawl(self.spider)
            self.crawler.start()
            self.crawler.stop()
            self.result_queue.put(self.items)

myspider.py:

    from scrapy.selector import HtmlXPathSelector
    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from scrapy.item import Item, Field

    class TorentItem(Item):
        title = Field()
        desc = Field()

    class MySpider(CrawlSpider):
        name = 'job'
        allowed_domains = ['tanitjobs.com']
        start_urls = ['http://tanitjobs.com/browse-by-category/Nurse/']
        rules = (
            Rule(SgmlLinkExtractor(allow=('page=*',),
                                   restrict_xpaths=('//div[@class="pageNavigation"]',),
                                   unique=True),
                 callback='parse_item', follow=True),
        )

        def parse_item(self, response):
            hxs = HtmlXPathSelector(response)
            items = hxs.select('//div[@class="offre"]/div[@class="detail"]')
            scraped_items = []
            for item in items:
                scraped_item = TorentItem()
                scraped_item['title'] = item.select('a/strong/text()').extract()
                scraped_item['desc'] = item.select('./div[@class="descriptionjob"]/text()').extract()
                scraped_items.append(scraped_item)
            # return after the loop, not inside it, so every item is kept
            return scraped_items
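
For reference, a slightly more idiomatic parse_item would yield each item as it is built instead of accumulating a list; a sketch using the same selectors (it would replace the method above inside MySpider):

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        for item in hxs.select('//div[@class="offre"]/div[@class="detail"]'):
            scraped_item = TorentItem()
            scraped_item['title'] = item.select('a/strong/text()').extract()
            scraped_item['desc'] = item.select('./div[@class="descriptionjob"]/text()').extract()
            yield scraped_item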

Best answer

I got this working on the shell using a Django management command. Below is my code snippet; feel free to modify it to suit your needs.

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import signals
from scrapy.utils.project import get_project_settings

from django.core.management.base import BaseCommand

from myspiderproject.spiders.myspider import MySpider

class ReactorControl:
    def __init__(self):
        self.crawlers_running = 0

    def add_crawler(self):
        self.crawlers_running += 1

    def remove_crawler(self):
        self.crawlers_running -= 1
        if self.crawlers_running == 0:
            reactor.stop()

def setup_crawler(domain):
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(reactor_control.remove_crawler, signal=signals.spider_closed)

    spider = MySpider(domain=domain)
    crawler.crawl(spider)
    reactor_control.add_crawler()
    crawler.start()

reactor_control = ReactorControl()

class Command(BaseCommand):
    help = 'Crawls the site'

    def handle(self, *args, **options):
        setup_crawler('somedomain.com')
        reactor.run()  # the script will block here until the spider_closed signal is sent
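
To trigger this from Celery, as the question intends, run the management command in a subprocess so that each crawl gets a fresh Twisted reactor; the reactor cannot be restarted once stopped, so a second in-process run would fail. A sketch, assuming the snippet above is saved as myapp/management/commands/crawl.py (hypothetical path and command name):

    import subprocess
    from djcelery import celery

    @celery.task
    def crawl_site():
        # a new process per invocation means a fresh reactor per crawl
        subprocess.check_call(['python', 'manage.py', 'crawl'])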

Hope this helps.
