我已经在这个问题上卡了12个小时,希望有人能帮上忙。
这是我的代码。我想要的是:在爬取页面时,获取页面上每个链接的锚文本和URL。
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.utils.url import urljoin_rfc
from scrapy.utils.response import get_base_url
from urlparse import urljoin
#from scrapy.item import Item
from tutorial.items import DmozItem
class HopitaloneSpider(CrawlSpider):
    """Crawl ``domain.co.uk`` and record the anchor text and URL of each link.

    Links matched by ``rules`` are followed, and every response fetched
    through that rule is handed to :meth:`parse_item`.
    """

    name = 'dmoz'
    allowed_domains = ['domain.co.uk']
    start_urls = [
        'http://www.domain.co.uk'
    ]
    rules = (
        # Follow every extracted link whose URL ends in a word character and
        # pass each fetched page to parse_item.
        Rule(SgmlLinkExtractor(allow=(r'\w+$', )), callback='parse_item', follow=True),
    )
    user_agent = 'Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))'

    def parse_item(self, response):
        """Build one ``DmozItem`` per hyperlink found in ``response``.

        Args:
            response: the downloaded page handed over by the crawl rule.

        Returns:
            A list of ``DmozItem`` objects. ``item['title']`` is the list of
            text nodes inside the ``<a>`` element and ``item['link']`` is a
            list holding the element's ``href`` resolved against the page's
            base URL. The list is empty when the page has no links.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        items = []
        # '//a' searches the whole document. The previous 'a/...' paths were
        # evaluated relative to <html> and therefore only matched <a> elements
        # that are direct children of <html>, i.e. nothing.
        for anchor in hxs.select('//a[@href]'):
            item = DmozItem()
            # './/text()' also collects text nested in child tags (<a><b>x</b></a>).
            item['title'] = anchor.select('.//text()').extract()
            # Resolve relative hrefs so the stored URL is usable on its own.
            item['link'] = [
                urljoin(base_url, href)
                for href in anchor.select('@href').extract()
            ]
            items.append(item)
        return items
我到底哪里做错了……我的眼睛现在都看疼了。
最佳答案
response.body应该是你想要的
# Sketch from the answer: read the complete page from response.body rather
# than selecting parts of it with XPath. Indentation was lost in the paste and
# the snippet is intentionally incomplete.
def parse_item(self, response):
#self.log('Hi, this is an item page! %s' % response.url)
# Body of the downloaded response, taken as-is.
body = response.body
# Placeholder left by the answer's author; how the item is built from `body`
# is not shown (this line is not valid Python as written).
item = ....
关于xpath - 如何使用hxs.select完整地获取整个文档,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/13435620/