# Scrape every book on books.toscrape.com genre by genre and save the
# title, rating, price and genre of each one to novel.csv.
import csv
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# URL to scrape the genres from
HOME_URL = 'http://books.toscrape.com/index.html'
# Class names the site puts on a book's rating paragraph
RATE_LIST = ['One', 'Two', 'Three', 'Four', 'Five']


def fetch(page_url):
    """Download page_url and return the response, decoded as UTF-8.

    The earlier version of this script had to strip a stray 'Â' from every
    price, which is what UTF-8 bytes look like when they are decoded as
    ISO-8859-1 (the fallback requests uses when the response headers name no
    charset). Decoding explicitly removes the need for that workaround.
    """
    response = requests.get(page_url, timeout=30)
    response.encoding = 'utf-8'
    return response


soup = BeautifulSoup(fetch(HOME_URL).text, 'html.parser')
navlist = soup.select('.nav-list')[0].find('li').find('ul').findAll('li')

# Genres listed on the site
genre_list = []
# Landing page of each genre, parallel to genre_list
genre_urls = []
# Scraped novels
novel_list = []

# Loop over navlist to collect every genre name together with its link.
# The link is taken from the page itself instead of being rebuilt from the
# display name and list position, which breaks for names containing spaces.
for item in navlist:
    link = item.find('a')
    genre_list.append(link.getText().strip().lower())
    genre_urls.append(urljoin(HOME_URL, link.get('href')))

# Loop over every genre
for genre, index_url in zip(genre_list, genre_urls):
    count = 1
    # NOTE(review): assumes each sidebar link points at the genre's
    # index.html, so its numbered pages are siblings named page-N.html
    # (the same pattern as the catalogue's page-{}.html) - confirm.
    page = fetch(urljoin(index_url, f'page-{count}.html'))
    # A genre without numbered pages answers 404 for page-1.html; only then
    # is its index.html downloaded, and it is the genre's only page.
    single_page = page.status_code == 404
    if single_page:
        page = fetch(index_url)

    while page.ok:
        page_soup = BeautifulSoup(page.text, 'html.parser')
        # Loop over every novel on the page to get its title, cost and rating
        for book in page_soup.select('.product_pod'):
            title = book.find('h3').find('a').get('title')
            cost = book.find('p', class_='price_color').getText().strip()
            # Find which rating class this novel carries
            rating = next(
                (name for name in RATE_LIST if book.find('p', class_=name)),
                None,
            )
            if rating is None:
                # As before, a novel without a recognisable rating is skipped.
                continue
            novel_list.append(
                {'Title': title, 'Rating': rating, 'Price': cost, 'Genre': genre}
            )
        if single_page:
            break
        # Move on to the next numbered page; a 404 there ends the loop.
        count += 1
        page = fetch(urljoin(index_url, f'page-{count}.html'))

# Write the novels to the CSV file
with open('novel.csv', 'w', encoding="utf-8", newline='') as f:
    fieldnames = ['Title', 'Rating', 'Price', 'Genre']
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(novel_list)
最佳答案
使用 for 循环并传入页码,依次请求每一页;然后把抓取到的数据加载到 pandas DataFrame 中,最后调用 to_csv 导出为 CSV 文件。
代码:
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Walk the 50 catalogue pages of books.toscrape.com, collect each book's
# title, star rating and price, print them as a DataFrame and save them to
# Titlebooks.csv.
url = 'http://books.toscrape.com/catalogue/page-{}.html'
title = []
ratings = []
cost = []
for page in range(1, 51):
    finalurl = url.format(page)
    response = requests.get(finalurl, timeout=30)
    # Stop on a failed download instead of silently contributing zero rows.
    response.raise_for_status()
    # The previous version sliced a stray leading character off every price
    # and still produced garbled titles: the page's UTF-8 bytes were being
    # decoded as ISO-8859-1, the fallback requests uses when the response
    # headers name no charset. Decode explicitly so all text is correct.
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    covers = soup.select('.image_container >a>img')
    stars = soup.select('p.star-rating')
    prices = soup.select('p.price_color')
    for t, r, c in zip(covers, stars, prices):
        title.append(t['alt'])
        # The rating word is the last class on the star-rating paragraph.
        ratings.append(r.attrs['class'][-1])
        # With correct decoding the price already reads like "£51.77".
        cost.append(c.text)
df = pd.DataFrame({"Title": title, "Ratings": ratings, "Cost": cost})
print(df)
df.to_csv('Titlebooks.csv')
在控制台上输出:
Cost Ratings Title
0 £51.77 Three A Light in the Attic
1 £53.74 One Tipping the Velvet
2 £50.10 One Soumission
3 £47.82 Four Sharp Objects
4 £54.23 Five Sapiens: A Brief History of Humankind
5 £22.65 One The Requiem Red
6 £33.34 Four The Dirty Little Secrets of Getting Your Dream...
7 £17.93 Three The Coming Woman: A Novel Based on the Life of...
8 £22.60 Four The Boys in the Boat: Nine Americans and Their...
9 £52.15 One The Black Maria
10 £13.99 Two Starving Hearts (Triangular Trade Trilogy, #1)
11 £20.66 Four Shakespeare's Sonnets
12 £17.46 Five Set Me Free
13 £52.29 Five Scott Pilgrim's Precious Little Life (Scott Pi...
14 £35.02 Five Rip it Up and Start Again
15 £57.25 Three Our Band Could Be Your Life: Scenes from the A...
16 £23.88 One Olio
17 £37.59 One Mesaerion: The Best Science Fiction Stories 18...
18 £51.33 Two Libertarianism for Beginners
19 £45.17 Two It's Only the Himalayas
20 £12.84 One In Her Wake
21 £37.32 Two How Music Works
22 £30.52 Three Foolproof Preserving: A Guide to Small Batch J...
23 £25.27 Five Chase Me (Paris Nights #2)
24 £34.53 Five Black Dust
25 £54.64 Three Birdsong: A Story in Pictures
26 £22.50 Three America's Cradle of Quarterbacks: Western Penn...
27 £53.13 Three Aladdin and His Wonderful Lamp
28 £40.30 Five Worlds Elsewhere: Journeys Around Shakespeareâ...
29 £44.18 Four Wall and Piece
.. ... ... ...
970 £24.89 Three Lord of the Flies
971 £58.99 Three Listen to Me (Fusion #1)
972 £57.20 Five Kitchens of the Great Midwest
973 £38.43 Five Jane Eyre
974 £34.74 Four Imperfect Harmony
975 £40.44 Four Icing (Aces Hockey #2)
976 £45.24 Three Hawkeye, Vol. 1: My Life as a Weapon (Hawkeye #1)
977 £34.96 Four Having the Barbarian's Baby (Ice Planet Barbar...
978 £56.76 Four Giant Days, Vol. 1 (Giant Days #1-4)
979 £40.28 Five Fruits Basket, Vol. 1 (Fruits Basket #1)
980 £38.00 Two Frankenstein
981 £28.80 Three Forever Rockers (The Rocker #12)
982 £39.24 Three Fighting Fate (Fighting #6)
983 £32.93 Two Emma
984 £51.32 Three Eat, Pray, Love
985 £47.09 Five Deep Under (Walker Security #1)
986 £28.42 Four Choosing Our Religion: The Spiritual Lives of ...
987 £22.85 Three Charlie and the Chocolate Factory (Charlie Buc...
988 £41.24 One Charity's Cross (Charles Towne Belles #4)
989 £39.07 Five Bright Lines
990 £29.82 One Bridget Jones's Diary (Bridget Jones #1)
991 £37.26 Four Bounty (Colorado Mountain #7)
992 £20.30 Three Blood Defense (Samantha Brinkman #1)
993 £34.65 Five Bleach, Vol. 1: Strawberry and the Soul Reaper...
994 £43.38 One Beyond Good and Evil
995 £55.53 One Alice in Wonderland (Alice's Adventures in Won...
996 £57.06 Four Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)
997 £16.97 Five A Spy's Devotion (The Regency Spies of London #1)
998 £53.98 One 1st to Die (Women's Murder Club #1)
999 £26.08 Five 1,000 Places to See Before You Die
[1000 rows x 3 columns]
如果您不想对最终页数进行硬编码,请尝试此操作。
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Same scrape as the fixed-range version, but the number of catalogue pages
# is read from the site's pager instead of being hard-coded.
response = requests.get("http://books.toscrape.com/index.html", timeout=30)
response.raise_for_status()
# Decode explicitly: without it the page's UTF-8 bytes are read as
# ISO-8859-1 (the fallback requests uses when the headers name no charset),
# which garbles "£" and any non-ASCII title.
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')
# Get the total page count: the number after "of" in the pager text.
pagecount = int(soup.select_one('.current').text.split('of')[-1].strip())
title = []
ratings = []
cost = []
for page in range(1, pagecount + 1):
    finalurl = "http://books.toscrape.com/catalogue/page-{}.html".format(page)
    response = requests.get(finalurl, timeout=30)
    # Stop on a failed download instead of silently contributing zero rows.
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    covers = soup.select('.image_container >a>img')
    stars = soup.select('p.star-rating')
    prices = soup.select('p.price_color')
    for t, r, c in zip(covers, stars, prices):
        title.append(t['alt'])
        # The rating word is the last class on the star-rating paragraph.
        ratings.append(r.attrs['class'][-1])
        # With correct decoding there is no stray leading character to slice
        # off; the price already reads like "£51.77".
        cost.append(c.text)
df = pd.DataFrame({"Title": title, "Ratings": ratings, "Cost": cost})
print(df)
df.to_csv('Titlebooks.csv')