This is my code for getting item URLs from eBay, i.e. link3:
import urllib2
from bs4 import BeautifulSoup

def url_soup(url):
    source = urllib2.urlopen(url).read()
    soup = BeautifulSoup(source)
    link = soup.select('a.ListItemLink')
    for links in link:
        link3 = 'http://www.ebay.com/%s' % links['href']
Dept = {"All Departments": "0", "Apparel": "5438", "Auto": "91083", "Baby": "5427", "Beauty": "1085666",
        "Books": "3920", "Electronics": "3944", "Gifts": "1094765", "Grocery": "976759", "Health": "976760",
        "Home": "4044", "Home Improvement": "1072864", "Jwelery": "3891", "Movies": "4096", "Music": "4104",
        "Party": "2637", "Patio": "5428", "Pets": "5440", "Pharmacy": "5431", "Photo Center": "5426",
        "Sports": "4125", "Toys": "4171", "Video Games": "2636"}
def gen_url(keyword, domain):
    if domain in Dept:
        main_url = ('http://www.ebay.com/search/search-ng.do?search_query=%s'
                    '&ic=16_0&Find=Find&search_constraint=%s') % (keyword, Dept.get(domain))
        url_soup(main_url)

gen_url('Bags', 'Apparel')
Now I want my spider to use each link3 as its start_urls. I'm new to this!
Best answer
You need to define a start_requests() method to build the spider's URLs dynamically.
For example, you should have something like this:
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.spider import BaseSpider


class MySpider(BaseSpider):
    name = "my_spider"
    domains = ['Auto']
    departments = {"All Departments": "0", "Apparel": "5438", "Auto": "91083", "Baby": "5427", "Beauty": "1085666",
                   "Books": "3920", "Electronics": "3944", "Gifts": "1094765", "Grocery": "976759", "Health": "976760",
                   "Home": "4044", "Home Improvement": "1072864", "Jwelery": "3891", "Movies": "4096", "Music": "4104",
                   "Party": "2637", "Patio": "5428", "Pets": "5440", "Pharmacy": "5431", "Photo Center": "5426",
                   "Sports": "4125", "Toys": "4171", "Video Games": "2636"}
    keyword = 'Auto'
    allowed_domains = ['ebay.com']

    def start_requests(self):
        # build one search URL per department and request it
        for domain in self.domains:
            if domain in self.departments:
                url = ('http://www.ebay.com/search/search-ng.do?search_query=%s'
                       '&ic=16_0&Find=Find&search_constraint=%s') % (self.keyword, self.departments.get(domain))
                print "YIELDING"
                yield Request(url)

    def parse(self, response):
        # follow every item link found on the search results page
        print "IN PARSE"
        sel = Selector(response)
        links = sel.xpath('//a[@class="ListItemLink"]/@href')
        for link in links:
            href = link.extract()
            yield Request('http://www.ebay.com/' + href, callback=self.parse_data)

    def parse_data(self, response):
        # do your actual crawling here
        print "IN PARSE DATA"
Hope this helps.