为了使用scrapy从网站获取我需要的数据,我首先需要创建一个响应对象,以便可以在其上使用HtmlXpathSelector。 HtmlXpathSelector不接受url字符串作为参数。在下面的代码示例中,“ response2”变量为空,因为我不知道该怎么做。
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from tarantula.items import OlgaItem
class OlgaSpider(CrawlSpider):
"""This crawler gets the physician's name and his homepage url."""
name = 'Olga'
DOWNLOAD_DELAY = 6 #para tentar evitar ser banido
#ROBOTSTXT_OBEY = True
#CONCURRENT_REQUESTS = 1
FEED_URI = '/home/mercutio22/gitcode/MedicWebsites.csv'
FEED_FORMAT = 'csv'
USER_AGENT = "Googlebot/2.1 ( http://www.google.com/bot.html )"
#allowed_domains = ['guiareunimedicos.med.br']
start_urls = (
'http://medial-saude.guiareunimedicos.med.br/index.pl?act=searc\
h&_id_=172&_ev_=Submit&_formSearchSubmit=%3Adefault%3A&type=0&country=0\
&q=oncologia#results/',
'http://www.guiareunimedicos.med.br/index.pl?act=search&_id_=17\
#2&_ev_=Submit&_formSearchSubmit=%3Adefault%3A&type=0&country=0&q=cancer\
#ologia#results/' )
rules = (
Rule(SgmlLinkExtractor(allow=r"V=", restrict_xpaths='//a[text()=">"]'),
callback='parse_item', follow=True),
)
def parse_item(self, response):
hxs = HtmlXPathSelector(response)
mdata = hxs.select('//div[contains(@class, "mdata")]')
links = mdata.select('./a/@href').extract()
names = mdata.select('./a/text()').extract()
items = []
for index in range(len(names)):
i = OlgaItem()
i['name'] = names[index]
i['link'] = links[index]
response2 =
hxs2 = HtmlXPathSelector(response2) ###
name = hxs2.select('//big/text()').extract()
i['clinics'] = hxs2.select('//h2/a/text()').extract()
data = hxs2.select('//div[contains(@class, "stab data")]')
addresses = [ x.select('./p/text()').extract() for x in data ]
addresses = [ ''.join(x) for x in addresses ]
addresses = [ x.replace('Telefone(s): \r\n\r\n\r\n', '') for x in addresses ]
addresses = [ x[2:] for x in addresses ]
i['addresses'] = addresses
i['phones'] = hxs2.select('//span[@id]/text()').extract()
items.append(i)
return items
最佳答案
您还可以通过提供一些html来创建HtmlXPathSelector
:
hxs = HtmlXPathSelector(text= '<div>blah-blah</div>')
关于python - 如何从URL字符串实例化scrapy.http.Response?,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/8030744/