我正在尝试检测商品在Amazon上的可用性。为什么此代码不起作用?
from simplified_scrapy.request import req
from simplified_scrapy.simplified_doc import SimplifiedDoc
import requests
import re
from bs4 import BeautifulSoup
from collections import OrderedDict
from time import sleep
import time
from lxml import html
import json
def check(url):
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
page = requests.get(url, headers = headers)
for i in range(20):
sleep(3)
doc = html.fromstring(page.content)
XPATH_AVAILABILITY = '//div[@id ="availability"]//text()'
RAw_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
AVAILABILITY = ''.join(RAw_AVAILABILITY).strip() if RAw_AVAILABILITY else None
return AVAILABILITY
file_name = raw_input("Enter file name: ")
filepath = "%s"%(file_name)
with open(filepath) as f:
listoflinks = [line.rstrip('\n') for line in f]
all_links = []
for i in listoflinks:
html = req.get(i)
doc = SimplifiedDoc(html)
amazon_links = doc.getElements('a')
amazon_links = amazon_links.containsOr(['https://www.amazon.com/','https://amzn.to/'],attr='href')
for a in amazon_links:
if a.href not in all_links:
all_links.append(a.href)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
for i in all_links:
print "LINK:"
print i
response = requests.get(i, headers=headers)
#soup = BeautifulSoup(html, "lxml")
soup = BeautifulSoup(response.content, features="lxml")
title = soup.select("#productTitle")[0].get_text().strip()
if check(i) == 'In stock.':
price = soup.select("#priceblock_saleprice")[0].get_text()
else:
price = "UNAVAILABLE"
review_count = int(soup.select("#acrCustomerReviewText")[0].get_text().split()[0])
jsonObject = {'title': title, 'price': price, 'review_count': review_count}
print json.dumps(jsonObject, indent=2)
print "////////////////////////////////////////////////"
print "..............................................."
print "FINALLY..."
print "# OF LINKS RETRIEVED:"
print len(all_links)
当我执行它时,出现此错误:
检查文件“ scra.py”,第17行
doc = html.fromstring(page.content)
AttributeError:“ unicode”对象没有属性“ fromstring”
请帮我。我已经尝试过将
page
转换为pagedata = page.json()
,但这只会使情况变得更糟。 最佳答案
尝试使用它代替html.fromstring
doc = BeautifulSoup(page.content, 'html.parser')
doc = doc.prettify()
关于python - AttributeError:“unicode”对象没有属性“fromstring”。如何解决这个问题?,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/59774103/