我写了一个 Scrapy 爬虫(spider),但每次运行它时都会出现下面的错误:

Traceback (most recent call last):
  File "/usr/local/lib/python2.7/dist-packages/twisted/internet/base.py", line 824, in runUntilCurrent
    call.func(*call.args, **call.kw)
  File "/usr/local/lib/python2.7/dist-packages/twisted/internet/task.py", line 607, in _tick
    taskObj._oneWorkUnit()
  File "/usr/local/lib/python2.7/dist-packages/twisted/internet/task.py", line 484, in _oneWorkUnit
    result = next(self._iterator)
  File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 57, in <genexpr>
    work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
  File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 96, in iter_errback
    yield it.next()
  File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/offsite.py", line 28, in process_spider_output
    for x in result:
  File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "/home/vaibhav/scrapyprog/comparison/eScraperInterface/eScraper/spiders/streetstylestoreSpider.py", line 38, in parse
    item['productURL'] = site.select('.//a/@href').extract()
exceptions.AttributeError: 'unicode' object has no attribute 'select'

我的代码是:
from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider

#------------------------------------------------------------------------------

class ESpider(CrawlSpider):
    """Spider for streetstylestore.com category listings.

    ``parse`` builds one ``EscraperItem`` per ``<li>`` of the ``product_list``
    element and issues a follow-up request to the product URL;
    ``parsePage2`` fills in the remaining fields from that page.

    NOTE(review): no crawl rules are defined in this class and ``parse`` is
    overridden; Scrapy's documentation advises against overriding ``parse``
    on a CrawlSpider -- confirm whether a plain spider base class was
    intended.
    """

    name = "streetstylestoreSpider"
    allowed_domains = ["streetstylestore.com"]

    # Category listing pages used as crawl entry points.
    start_urls = [
                  "http://streetstylestore.com/index.php?id_category=16&controller=category",
                  "http://streetstylestore.com/index.php?id_category=46&controller=category",
                  "http://streetstylestore.com/index.php?id_category=51&controller=category",
                  "http://streetstylestore.com/index.php?id_category=61&controller=category",
                  "http://streetstylestore.com/index.php?id_category=4&controller=category"
                  ]


    def parse(self, response):
        """Yield one follow-up Request per product entry of a listing page.

        For every element matched by ``//ul[@id="product_list"]/li`` an
        ``EscraperItem`` is filled with currency, site, URL, image, title and
        price fields. The item is attached to a ``Request`` for the product
        URL through ``request.meta['item']``; ``parsePage2`` is the callback.
        """

        # NOTE(review): appended to below but never returned or yielded.
        items = []
        hxs = HtmlXPathSelector(response)
        # NOTE(review): the trailing .extract() turns the matches into unicode
        # strings, so `site.select(...)` in the loop below raises
        # "AttributeError: 'unicode' object has no attribute 'select'".
        # Without .extract() the elements stay selector objects that can be
        # queried further.
        sites = hxs.select('//ul[@id="product_list"]/li').extract()

        for site in sites:

            item = EscraperItem()
            item['currency'] = 'INR'
            item['productSite'] = ["http://streetstylestore.com"]
            item['productURL'] = site.select('.//a/@href').extract()
            item['productImage'] = site.select('.//a/img/@src').extract()
            item['productTitle'] = site.select('.//a/@title').extract()
            # Keep only the text after the last 'Rs' and drop thousands commas.
            # NOTE(review): both price queries run against `hxs` (the whole
            # response), not `site`, so presumably every item receives the
            # prices of all products on the page -- confirm whether `site`
            # was intended.
            productMRP = [i.strip().split('Rs')[-1].replace(',','') for i in hxs.select('.//div[@class="price_container"]//span[@class="old_price"]/text()').extract()]
            productPrice = [i.strip().split('Rs')[-1].replace(',','') for i in hxs.select('.//div[@class="price_container"]//p[@class="price"]/text()').extract()]
            item['productPrice'] = productMRP + productPrice

            items.append(item)
            # First extracted href; raises IndexError if no link was found.
            secondURL = item['productURL'][0]
            request = Request(secondURL,callback=self.parsePage2)
            # Carry the partially filled item over to the product-page callback.
            request.meta['item'] = item
            yield request


    def parsePage2(self, response):
        """Complete the item carried in ``response.meta['item']`` and return it.

        Sets ``availability``, ``hasVariants``, ``productCategory``,
        ``productSubCategory``, ``productDesc``, ``productImage`` and
        ``image_urls`` from the product page.
        """

        temp = []
        item = response.meta['item']
        hxs = HtmlXPathSelector(response)

        # Text nodes under div.details containing 'In Stock ' (trailing space
        # is part of the match).
        availability =  [i for i in hxs.select('//div[@class="details"]/p/text()').extract() if 'In Stock ' in i]

        if  availability:
            item['availability'] = True
        else:
            item['availability'] = False

        # Any div.attribute_list on the page marks the product as having variants.
        hasVariants =  hxs.select('//div[@class="attribute_list"]').extract()

        if hasVariants:
            item['hasVariants'] = True
        else:
            item['hasVariants'] = False

        # Breadcrumb link texts: first is used as category, second as sub-category.
        category = hxs.select('//div[@class="breadcrumb"]/a/text()').extract()
        if category:
            productCategory = [category[0]]
            # NOTE(review): always true inside `if category:`, so category[1]
            # raises IndexError when there is exactly one breadcrumb link and
            # the else branch is unreachable; `> 1` looks intended -- confirm.
            if len(category) >= 1:
                productSubCategory = [category[1]]
            else:
                productSubCategory = ['']
        else:
            productCategory = ['']
            productSubCategory = ['']

        item['productCategory'] = productCategory
        item['productSubCategory'] = productSubCategory

        # Thumbnail image URLs with "medium" replaced by "large".
        for i in hxs.select('//div[@id="thumbs_list"]/ul/li/a/img/@src').extract():
            temp.append(i.replace("medium","large"))

        item['productDesc'] =  " ".join([i for i in hxs.select('//div[@id="short_description_content"]/p/text()').extract()])
        # Listing image(s) + thumbnail srcs + thumbnail link hrefs + the
        # "large" variants built above.
        item['productImage'] = item['productImage'] + hxs.select('//div[@id="thumbs_list"]/ul/li/a/img/@src').extract() + hxs.select('//div[@id="thumbs_list"]/ul/li/a/@href').extract() + temp
        # De-duplicate through a set; the original ordering is not preserved.
        item['image_urls'] = list(set(item['productImage']))

        return item

有人可以告诉我我的代码有什么问题吗...

最佳答案

不要对要保存到 sites 中的结果调用 .extract()——extract() 返回的是文本(unicode 字符串),而您此时还不需要从中取出文本,因为后面还要对其中的每个元素继续调用 .select()。这一行...

sites = hxs.select('//ul[@id="product_list"]/li').extract()

...应该是这样的:
sites = hxs.select('//ul[@id="product_list"]/li')

关于python - Scrapy异常-exceptions.AttributeError : 'unicode' object has no attribute 'select' ,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/17268175/

10-09 08:49