问题描述
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

# Amazon.in search-results page (keyword "mobile", fixed price band).
my_url = "https://www.amazon.in/s/ref=sr_nr_p_36_4?fst=as%3Aoff&rh=n%3A976419031%2Cn%3A1389401031%2Cn%3A1389432031%2Ck%3Amobile%2Cp_36%3A1318507031&keywords=mobile&ie=UTF8&qid=1543902909&rnid=1318502031"

# Download the raw HTML and close the connection promptly.
uClient = uReq(my_url)
raw_html = uClient.read()
uClient.close()

page_soup = soup(raw_html, "html.parser")
# One <div class="s-item-container"> per product card.
containers = page_soup.findAll("div", {"class": "s-item-container"})

filename = "Product.csv"
# 'with' guarantees the file is closed even if a later line raises.
# (The original ended with `f.close` — no parentheses — so close() was
# never actually called and the handle leaked.)
with open(filename, "w") as f:
    headers = "Name,Price,Prime \n"
    f.write(headers)
    for container in containers:
        title_container = container.findAll("div", {"class": "a-row a-spacing-mini"})
        price = container.findAll("span", {"class": "a-size-small a-color-secondary a-text-strike"})
        prime = container.findAll("i", {"class": "a-icon a-icon-prime a-icon-small s-align-text-bottom"})
        # Not every result card carries all three elements (e.g. items with
        # no strike-through price) — skip those cards instead of raising
        # IndexError on an empty findAll() result, which is the crash the
        # question reports.
        if not (title_container and price and prime):
            continue
        product_name = title_container[0].div.a.h2.text
        product_price = price[0].text.strip()
        product_prime = prime[0].text
        print("product_name : " + product_name)
        print("product_price : " + product_price)
        print("product_prime : " + product_prime)
        # NOTE(review): names containing commas will break this hand-rolled
        # CSV row — the csv module would quote them properly.
        f.write(product_name + "," + product_price + "," + product_prime + "\n")
我写了我的第一个网页抓取代码,但由于某种原因它只循环了 4 次就显示了一条错误消息:File "firstwebscrapping.py", line 23, in &lt;module&gt;: product_price = price[0].text.strip() — IndexError: list index out of range。拜托,有人能解释一下我哪里做错了吗?
I wrote my first web-scraping code, but for some reason it only looped 4 times and then showed an error message (File "firstwebscrapping.py", line 23, in &lt;module&gt;: product_price = price[0].text.strip() — IndexError: list index out of range). Please, can someone explain where I've gone wrong?
推荐答案
第一个问题是不是每件商品都有原价和现价,所以可以修改这段代码.
The first problem is not every item have the original price and current price, so you can modify this code.
来自 "class":"a-size-small a-color-secondary a-text-strike"
To "class":"a-size-base a-color-price s-price a-text-bold"
此代码将引发另一个问题
And another issue will raise from this code
containers = target[0].findAll("div",{"class":"s-item-container"})
s-item-container 不仅出现在 ajaxData 中,也出现在 atfResults 中,所以我们先用 select 函数取得目标 div 列表:target = page_soup.select('div#atfResults'),再在其中查找商品容器。希望这能解决您的问题。
The s-item-container class appears not only inside ajaxData but also inside atfResults, so we first use the select function to get the target div list with target = page_soup.select('div#atfResults'), and then search for the product containers inside it. Hope this solves your question.
div#search-main-wrapper &gt; div#ajaxData &gt; div.s-item-container 以及 div#search-main-wrapper &gt; div#atfResults &gt; div.s-item-container
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

# Amazon.in search-results page (keyword "mobile", fixed price band).
my_url = "https://www.amazon.in/s/ref=sr_nr_p_36_4?fst=as%3Aoff&rh=n%3A976419031%2Cn%3A1389401031%2Cn%3A1389432031%2Ck%3Amobile%2Cp_36%3A1318507031&keywords=mobile&ie=UTF8&qid=1543902909&rnid=1318502031"

uClient = uReq(my_url)
raw_html = uClient.read()
uClient.close()

page_soup = soup(raw_html, "html.parser")
# Restrict the search to the organic-results container; the same
# s-item-container class also appears under div#ajaxData.
target = page_soup.select('div#atfResults')
# Guard against a page layout change: select() may return an empty list.
if not target:
    raise SystemExit("div#atfResults not found - page layout may have changed")
containers = target[0].findAll("div", {"class": "s-item-container"})

filename = "Product.csv"
# 'with' closes the file even if a container below raises, which the
# original bare open()/close() pair did not guarantee.
with open(filename, "w") as f:
    headers = "Name,Price,Prime \n"
    f.write(headers)
    print(len(containers))
    for container in containers:
        title_container = container.findAll("div", {"class": "a-row a-spacing-mini"})
        price = container.findAll("span", {"class": "a-size-base a-color-price s-price a-text-bold"})
        prime = container.findAll("i", {"class": "a-icon a-icon-prime a-icon-small s-align-text-bottom"})
        # Cards without a price or without the Prime badge would still
        # raise IndexError on [0]; skip them instead of crashing.
        if not (title_container and price and prime):
            continue
        product_name = title_container[0].div.a.h2.text
        product_price = price[0].text.strip()
        product_prime = prime[0].text
        print("product_name : " + product_name)
        print("product_price : " + product_price)
        print("product_prime : " + product_prime)
        # NOTE(review): commas inside product names break this hand-rolled
        # CSV row — csv.writer would quote them correctly.
        f.write(product_name + "," + product_price + "," + product_prime + "\n")
这篇关于网页抓取列表索引超出范围的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!