我想在 Django 项目中运行 Scrapy 爬虫(通过 Celery 任务,在管理界面中触发)。下面是我的代码。
当我尝试从 Python shell 调用该任务时,出现了错误。
djangoproject:
-demoapp:
-tasks.py
-spider.py
-myspider.py
-models.py
.....
tasks.py:
from djcelery import celery
from demoapp.spider import *
from demoapp.myspider import *
@celery.task
def add(x, y):
    """Return the sum of ``x`` and ``y``; registered as a Celery task."""
    total = x + y
    return total
@celery.task
def scra():
    """Start a crawl of ``MySpider`` in a separate worker process.

    The task is fire-and-forget: it launches a ``CrawlerWorker`` process and
    returns without waiting for the crawl to finish or reading the items the
    worker puts on the result queue.

    Returns:
        The string ``"success"`` once the worker process has been started.
    """
    # Imported explicitly instead of relying on ``Queue`` leaking in through
    # the ``from demoapp.spider import *`` star import.  ``multiprocessing.Queue``
    # is the supported factory; the raw ``multiprocessing.queues.Queue`` class
    # cannot be instantiated without a context argument on Python 3.
    from multiprocessing import Queue

    result_queue = Queue()
    crawler = CrawlerWorker(MySpider(), result_queue)
    crawler.start()
    return "success"
spider.py:
from scrapy import project, signals
from scrapy.settings import Settings
from scrapy.crawler import Crawler
from scrapy.xlib.pydispatch import dispatcher
from multiprocessing.queues import Queue
import multiprocessing
class CrawlerWorker(multiprocessing.Process):
    """Process subclass that crawls one spider and reports collected items.

    Items received through the ``item_passed`` signal are accumulated in
    ``self.items`` and put on ``result_queue`` at the end of ``run``.
    """

    def __init__(self, spider, result_queue):
        """Prepare a configured crawler for ``spider``.

        Args:
            spider: spider instance handed to ``crawler.crawl`` in ``run``.
            result_queue: queue that receives the list of collected items.
        """
        super(CrawlerWorker, self).__init__()
        self.spider = spider
        self.result_queue = result_queue
        self.items = []

        self.crawler = Crawler(Settings())
        # Install the crawler only when ``scrapy.project`` does not already
        # expose a ``crawler`` attribute.
        already_installed = hasattr(project, 'crawler')
        if not already_installed:
            self.crawler.install()
        self.crawler.configure()

        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        """Signal handler: remember every item that was passed."""
        self.items.append(item)

    def run(self):
        """Crawl the spider, then publish the collected items on the queue."""
        crawler = self.crawler
        crawler.crawl(self.spider)
        crawler.start()
        crawler.stop()
        self.result_queue.put(self.items)
myspider.py:
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.item import Item, Field
class TorentItem(Item):
    """Scraped record with two fields: ``title`` and ``desc``."""

    title = Field()
    desc = Field()
class MySpider(CrawlSpider):
    """Crawl spider for the Nurse job category on tanitjobs.com.

    Follows links found inside the ``pageNavigation`` block and builds one
    ``TorentItem`` per job offer on every followed page.
    """

    name = 'job'
    allowed_domains = ['tanitjobs.com']
    start_urls = [
        'http://tanitjobs.com/browse-by-category/Nurse/',
    ]
    rules = (
        Rule(
            SgmlLinkExtractor(
                # ``allow`` takes regular expressions, not shell globs.  The
                # former 'page=*' meant "page" followed by any number of "="
                # and therefore matched every URL containing "page".
                # 'page=' is the regex equivalent of the intended glob.
                allow=(r'page=',),
                restrict_xpaths=('//div[@class="pageNavigation"]',),
                unique=True,
            ),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Extract the job offers contained in ``response``.

        Args:
            response: the downloaded page passed in by the crawl rule.

        Returns:
            A list of ``TorentItem`` objects whose ``title`` and ``desc``
            values are the lists returned by ``extract()``.
        """
        hxs = HtmlXPathSelector(response)
        offers = hxs.select('//div[@class="offre"]/div[@class="detail"]')

        scraped_items = []
        for offer in offers:
            scraped_item = TorentItem()
            scraped_item['title'] = offer.select('a/strong/text()').extract()
            scraped_item['desc'] = offer.select(
                './div[@class="descriptionjob"]/text()').extract()
            scraped_items.append(scraped_item)
        # Return after the loop so that every offer on the page is included.
        return scraped_items
最佳答案
我通过 Django 管理命令(management command)在 shell 中成功运行了爬虫。以下是我的代码片段,您可以根据自己的需求随意修改。
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import signals
from scrapy.utils.project import get_project_settings
from django.core.management.base import BaseCommand
from myspiderproject.spiders.myspider import MySpider
class ReactorControl(object):
    """Count running crawlers and stop the Twisted reactor after the last one.

    Call ``add_crawler`` once for every crawler that is started and
    ``remove_crawler`` once for every crawler that finishes.
    """

    def __init__(self):
        # Number of crawlers added and not yet removed.
        self.crawlers_running = 0

    def add_crawler(self):
        """Record that one more crawler is running."""
        self.crawlers_running += 1

    def remove_crawler(self):
        """Record that a crawler finished; stop the reactor if none remain."""
        self.crawlers_running -= 1
        if self.crawlers_running == 0:
            reactor.stop()
def setup_crawler(domain):
    """Create, wire up and start a crawler running ``MySpider``.

    Args:
        domain: value forwarded to ``MySpider`` as its ``domain`` keyword.
    """
    crawler = Crawler(get_project_settings())
    crawler.configure()
    # Have the shared controller notified when this crawler's spider closes.
    crawler.signals.connect(
        reactor_control.remove_crawler,
        signal=signals.spider_closed,
    )
    crawler.crawl(MySpider(domain=domain))
    reactor_control.add_crawler()
    crawler.start()


# Module-level controller shared by every call to setup_crawler().
reactor_control = ReactorControl()
class Command(BaseCommand):
    """Django management command that runs the spider from the shell."""

    help = 'Crawls the site'

    def handle(self, *args, **options):
        """Set up the crawler and run the reactor."""
        target_domain = 'somedomain.com'
        setup_crawler(target_domain)
        # Blocks here until the reactor is stopped, which happens once the
        # spider_closed signal has been sent for the last running crawler.
        reactor.run()
希望这对您有所帮助。