Problem Description
This is my code to scrape odds from www.oddsportal.com.
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool
import os
import re

class Driver:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # The next line suppresses ChromeDriver logging:
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        self.driver.quit()  # clean up driver when we are cleaned up
        # print('The driver has been "quitted".')

threadLocal = threading.local()

def create_driver():
    the_driver = getattr(threadLocal, 'the_driver', None)
    if the_driver is None:
        the_driver = Driver()
        setattr(threadLocal, 'the_driver', the_driver)
    return the_driver.driver

class GameData:
    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []

def generate_matches(table):
    global country, league
    tr_tags = table.findAll('tr')
    for tr_tag in tr_tags:
        if 'class' not in tr_tag.attrs:
            continue
        tr_class = tr_tag['class']
        if 'dark' in tr_class:
            th_tag = tr_tag.find('th', {'class': 'first2 tl'})
            a_tags = th_tag.findAll('a')
            country = a_tags[0].text
            league = a_tags[1].text
        elif 'deactivate' in tr_class:
            td_tags = tr_tag.findAll('td')
            yield td_tags[0].text, td_tags[1].text, td_tags[2].text, td_tags[3].text, \
                td_tags[4].text, td_tags[5].text, country, league

def parse_data(url):
    browser = create_driver()
    browser.get(url)
    soup = bs(browser.page_source, "lxml")
    div = soup.find('div', {'id': 'col-content'})
    table = div.find('table', {'class': 'table-main'})
    h1 = soup.find('h1').text
    m = re.search(r'\d+ \w+ \d{4}$', h1)
    game_date = m[0]
    game_data = GameData()
    for row in generate_matches(table):
        game_data.date.append(game_date)
        game_data.time.append(row[0])
        game_data.game.append(row[1])
        game_data.score.append(row[2])
        game_data.home_odds.append(row[3])
        game_data.draw_odds.append(row[4])
        game_data.away_odds.append(row[5])
        game_data.country.append(row[6])
        game_data.league.append(row[7])
    return game_data

# URLs go here
urls = {
    "https://www.oddsportal.com/matches/soccer/20210903/",
}

if __name__ == '__main__':
    results = None
    # To limit the number of browsers we will use
    # (set to a large number if you don't want a limit):
    MAX_BROWSERS = 5
    pool = ThreadPool(min(MAX_BROWSERS, len(urls)))
    for game_data in pool.imap(parse_data, urls):
        result = pd.DataFrame(game_data.__dict__)
        if results is None:
            results = result
        else:
            results = results.append(result, ignore_index=True)
    print(results)
    # print(results.head())
    # ensure all the drivers are "quitted":
    del threadLocal
    import gc
    gc.collect()  # a little extra insurance
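For context on the create_driver pattern above: threading.local() gives every pool thread its own attribute namespace, so each worker creates one Chrome instance the first time it needs one and then reuses it for all of its URLs, rather than paying browser start-up cost per page. Here is a minimal, self-contained sketch of the same idea, with a plain object standing in for the webdriver:

import threading
from multiprocessing.pool import ThreadPool

threadLocal = threading.local()

def get_resource():
    # Each thread sees its own 'resource' attribute; first access creates it.
    resource = getattr(threadLocal, 'resource', None)
    if resource is None:
        resource = object()  # stand-in for an expensive object such as a webdriver
        threadLocal.resource = resource
    return resource

def task(i):
    return id(get_resource())  # constant within a thread, distinct across threads

print(len(set(ThreadPool(3).map(task, range(12)))))  # prints at most 3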
Currently, the code only gets data for one URL. I am trying to integrate the snippet below, which iterates over all the links for "Yesterday, today, tomorrow and the next 5 days", into my code.
This part of another script gets those URLs:
browser = webdriver.Chrome()

def get_urls(browser, landing_page):
    browser.get(landing_page)
    urls = [i.get_attribute('href') for i in
            browser.find_elements_by_css_selector(
                '.next-games-date > a:nth-child(1), .next-games-date > a:nth-child(n+3)')]
    return urls

....

if __name__ == '__main__':
    start_url = "https://www.oddsportal.com/matches/soccer/"
    urls = []
    browser = webdriver.Chrome()
    results = None
    urls = get_urls(browser, start_url)
    urls.insert(0, start_url)
    for number, url in enumerate(urls):
        if number > 0:
            browser.get(url)
        html = browser.page_source
        game_data = parse_data(html)
        if game_data is None:
            continue
        result = pd.DataFrame(game_data.__dict__)
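A side note on the selector call: find_elements_by_css_selector was deprecated in Selenium 4 and removed in later 4.x releases. If you are on a current Selenium, the equivalent is find_elements with a By locator; the rest of the snippet is unchanged:

from selenium.webdriver.common.by import By

urls = [i.get_attribute('href') for i in
        browser.find_elements(By.CSS_SELECTOR,
            '.next-games-date > a:nth-child(1), .next-games-date > a:nth-child(n+3)')]

The selector deliberately skips a:nth-child(2), which appears to be the link for the day currently displayed.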
How do I integrate these URLs with my code and iterate over them so that I end up with one single dataframe?
Recommended Answer
I had to make some adjustments to the function generate_matches, since the class names returned for certain rows were not reliable. I also removed the global statements from that function, which I never should have had.
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool
import os
import re

class Driver:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # The next line suppresses ChromeDriver logging:
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        self.driver.quit()  # clean up driver when we are cleaned up
        # print('The driver has been "quitted".')

threadLocal = threading.local()

def create_driver():
    the_driver = getattr(threadLocal, 'the_driver', None)
    if the_driver is None:
        the_driver = Driver()
        setattr(threadLocal, 'the_driver', the_driver)
    return the_driver.driver

class GameData:
    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []

def generate_matches(table):
    tr_tags = table.findAll('tr')
    for tr_tag in tr_tags:
        if 'class' in tr_tag.attrs and 'dark' in tr_tag['class']:
            th_tag = tr_tag.find('th', {'class': 'first2 tl'})
            a_tags = th_tag.findAll('a')
            country = a_tags[0].text
            league = a_tags[1].text
        else:
            td_tags = tr_tag.findAll('td')
            yield td_tags[0].text, td_tags[1].text, td_tags[2].text, td_tags[3].text, \
                td_tags[4].text, td_tags[5].text, country, league

def parse_data(url, return_urls=False):
    browser = create_driver()
    browser.get(url)
    soup = bs(browser.page_source, "lxml")
    div = soup.find('div', {'id': 'col-content'})
    table = div.find('table', {'class': 'table-main'})
    h1 = soup.find('h1').text
    m = re.search(r'\d+ \w+ \d{4}$', h1)
    game_date = m[0]
    game_data = GameData()
    for row in generate_matches(table):
        game_data.date.append(game_date)
        game_data.time.append(row[0])
        game_data.game.append(row[1])
        game_data.score.append(row[2])
        game_data.home_odds.append(row[3])
        game_data.draw_odds.append(row[4])
        game_data.away_odds.append(row[5])
        game_data.country.append(row[6])
        game_data.league.append(row[7])
    if return_urls:
        span = soup.find('span', {'class': 'next-games-date'})
        a_tags = span.findAll('a')
        urls = ['https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags]
        return game_data, urls
    return game_data

if __name__ == '__main__':
    results = None
    pool = ThreadPool(5)  # We will be getting, however, 7 URLs
    # Get today's data and the Urls for the other days:
    game_data_today, urls = pool.apply(parse_data, args=('https://www.oddsportal.com/matches/soccer', True))
    urls.pop(1)  # Remove url for today: We already have the data for that
    game_data_results = pool.imap(parse_data, urls)
    for i in range(8):
        game_data = game_data_today if i == 1 else next(game_data_results)
        result = pd.DataFrame(game_data.__dict__)
        if results is None:
            results = result
        else:
            results = results.append(result, ignore_index=True)
    print(results)
    # print(results.head())
    # ensure all the drivers are "quitted":
    del threadLocal
    import gc
    gc.collect()  # a little extra insurance
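One caveat before the output below: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so on a current pandas the accumulation step should collect the per-day frames in a list and concatenate once. A sketch, where all_game_data stands in for game_data_today plus the items from game_data_results:

import pandas as pd

frames = [pd.DataFrame(game_data.__dict__) for game_data in all_game_data]
results = pd.concat(frames, ignore_index=True)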
Prints:
date time game score home_odds draw_odds away_odds country league
0 07 Sep 2021 00:00 Pachuca W - Monterrey W 0:1 +219 +280 -106 Mexico Liga MX Women
1 07 Sep 2021 01:05 Millonarios - Patriotas 1:0 -303 +380 +807 Colombia Primera A
2 07 Sep 2021 02:00 Club Tijuana W - Club Leon W 4:0 -149 +293 +311 Mexico Liga MX Women
3 07 Sep 2021 08:30 Suzhou Dongwu - Nanjing City 0:0 +165 +190 +177 China Jia League
4 07 Sep 2021 08:45 Kuching City FC - Sarawak Utd. 1:0 +309 +271 -143 Malaysia Premier League
... ... ... ... ... ... ... ... ... ...
1305 14 Sep 2021 21:45 Central Cordoba - Atl. Tucuman +192 +217 +146 13 Argentina Liga Profesional
1306 14 Sep 2021 22:00 Colo Colo - Everton -141 +249 +395 11 Chile Primera Division
1307 14 Sep 2021 23:30 Columbus Crew - New York Red Bulls - - - 1 USA MLS
1308 14 Sep 2021 23:30 New York City - FC Dallas - - - 1 USA MLS
1309 14 Sep 2021 23:30 Toronto FC - Inter Miami - - - 1 USA MLS
[1310 rows x 9 columns]
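For reference, the date column in this output comes from the page heading: re.search(r'\d+ \w+ \d{4}$', h1) picks up a trailing "day month year" string. A quick illustration with a made-up heading:

import re

h1 = "Soccer Matches, 07 Sep 2021"  # hypothetical heading text
m = re.search(r'\d+ \w+ \d{4}$', h1)
print(m[0])  # 07 Sep 2021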