本文介绍了如何使用python从网站中提取表的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!
问题描述
我一直在尝试从网站中提取表格,但毫无头绪,谁能帮帮我?我的目标是提取 Scope(范围)页面中的表格:
import requests
import json
import pandas as pd
import re

def get_organisationId(url):
    # url = 'https://training.gov.au/Organisation/Details/31102'
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36'}
    resp = requests.get(url, headers=headers)
    id_list = re.findall(r'OrganisationId=(.*?)&', resp.text)
    organisationId = id_list[0] if id_list else None
    return organisationId

# 首先获取 organisationId
url = 'https://training.gov.au/Organisation/Details/31102'
organisationId = get_organisationId(url)

def get_AjaxScopeQualification(organisationId):
    if organisationId:
        url = f'https://training.gov.au/Organisation/AjaxScopeQualification/{organisationId}?tabIndex=4'
        headers = {
            'origin': 'https://training.gov.au',
            'referer': f'https://training.gov.au/Organisation/Details/{organisationId}?tabIndex=4',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest'
        }
        data = {'page': '1', 'size': '100', 'orderBy': 'Code-asc', 'groupBy': '', 'filter': ''}
        r = requests.post(url, json=data, headers=headers)
        response = json.loads(re.sub(r'new Date\((\d+),(\d+),(\d+),0,0,0\)', r'"\1-\2-\2"', r.text))
        return response

response = get_AjaxScopeQualification(organisationId)
dfn = pd.json_normalize(response, 'data', meta=['total'])
print(dfn.columns)
print(dfn[['Code', 'Title', 'Extent']])
结果:
response['data'][0]
{'Id': '5096634d-4210-4fd4-a51d-f548cd39d57b',
 'NrtId': '2feb7e3f-7fc6-4719-ba66-2a066f6635c7',
 'RtoId': '3fbfd9c9-3cce-4d69-973e-4e2674f8c5a9',
 'TrainingComponentType': 2,
 'Code': 'BSB20115',
 'Title': 'Certificate II in Business',
 'IsImplicit': False,
 'ExtentId': '01',
 'Extent': 'Deliver and assess',
 'StartDate': '2015-3-3',
 'EndDate': '2022-3-3',
 'DeliveryNsw': True,
 'DeliveryVic': True,
 'DeliveryQld': True,
 'DeliverySa': True,
 'DeliveryWa': True,
 'DeliveryTas': True,
 'DeliveryNt': True,
 'DeliveryAct': True,
 'ScopeDecisionType': 0,
 'ScopeDecision': 'Deliver and assess',
 'OverseasCodeAlpha': None,
 'OverseasCodeAlhpaList': [],
 'OverseasCodeAlphaOutput': ''}
I have been trying to extract the table from a website but I am lost. Can anyone help me? My goal is to extract the table of the scope page: https://training.gov.au/Organisation/Details/31102
import requests
from bs4 import BeautifulSoup
# Fetch the organisation details page and try to read the scope table from
# the static HTML.
# NOTE(review): the scope table appears to be loaded by a separate XHR
# request (AjaxScopeQualification), so it is normally absent from the
# static HTML; soup.find(...) then returns None and the original code
# crashed with AttributeError on table.find_all. Guard against that and
# fall back to an empty result — the XHR-based approach below is the fix.
url = "https://training.gov.au/Organisation/Details/31102"
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')
table = soup.find(id="ScopeQualification")
if table is not None:
    rows = [row.text.split() for row in table.find_all("tr")]
else:
    rows = []  # table not present in the static HTML; use the AJAX endpoint instead
解决方案
- Find the `OrganisationId` in the HTML of 'https://training.gov.au/Organisation/Details/31102'.
- Find the XHR URL the page calls, https://training.gov.au/Organisation/AjaxScopeQualification/3fbfd9c9-3cce-4d69-973e-4e2674f8c5a9?tabIndex=4, which uses the POST method.
import requests
import json
import pandas as pd
import re
def get_organisationId(url):
    """Fetch *url* and extract the OrganisationId GUID from the page HTML.

    The details page embeds links of the form ``...OrganisationId=<guid>&...``;
    the first such id is returned, or None when the page contains none.
    """
    # url = 'https://training.gov.au/Organisation/Details/31102'
    ua = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) '
          'AppleWebKit/537.36 (KHTML, like Gecko) '
          'Chrome/87.0.4280.67 Safari/537.36')
    resp = requests.get(url, headers={'User-Agent': ua})
    matches = re.findall(r'OrganisationId=(.*?)&', resp.text)
    if not matches:
        return None
    return matches[0]
# get organisationId first
# (the details page embeds the GUID that the AJAX endpoint below requires)
url = 'https://training.gov.au/Organisation/Details/31102'
organisationId = get_organisationId(url)
def get_AjaxScopeQualification(organisationId):
    """POST to the AjaxScopeQualification XHR endpoint and return parsed JSON.

    The endpoint's response embeds JavaScript ``new Date(y,m,d,0,0,0)``
    literals, which are not valid JSON; they are rewritten to ``"y-m-d"``
    strings before parsing.  Returns None when *organisationId* is falsy
    (same as the original implicit None return).
    """
    if not organisationId:
        return None
    url = f'https://training.gov.au/Organisation/AjaxScopeQualification/{organisationId}?tabIndex=4'
    headers = {
        'origin': 'https://training.gov.au',
        'referer': f'https://training.gov.au/Organisation/Details/{organisationId}?tabIndex=4',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest'
    }
    data = {'page': '1', 'size': '100', 'orderBy': 'Code-asc', 'groupBy': '', 'filter': ''}
    r = requests.post(url, json=data, headers=headers)
    # BUG FIX: the replacement used to be r'"\1-\2-\2"', repeating the month
    # group as the day (the sample output shows month == day for every date).
    # Use group 3 for the day.
    # NOTE(review): JavaScript Date months are 0-based, so the month value
    # here may be off by one — confirm before relying on these dates.
    cleaned = re.sub(r'new Date\((\d+),(\d+),(\d+),0,0,0\)', r'"\1-\2-\3"', r.text)
    return json.loads(cleaned)
# Fetch the scope data and print the interesting columns.
# BUG FIX: get_AjaxScopeQualification returns None when no OrganisationId
# was found; the original code then crashed inside pd.json_normalize with a
# TypeError. Guard the None case explicitly.
response = get_AjaxScopeQualification(organisationId)
if response is None:
    print('Could not resolve an OrganisationId; nothing to fetch.')
else:
    # 'data' holds one record per qualification; 'total' is the record count
    dfn = pd.json_normalize(response, 'data', meta=['total'])
    print(dfn.columns)
    print(dfn[['Code', 'Title', 'Extent']])
result:
response['data'][0]
{'Id': '5096634d-4210-4fd4-a51d-f548cd39d57b',
'NrtId': '2feb7e3f-7fc6-4719-ba66-2a066f6635c7',
'RtoId': '3fbfd9c9-3cce-4d69-973e-4e2674f8c5a9',
'TrainingComponentType': 2,
'Code': 'BSB20115',
'Title': 'Certificate II in Business',
'IsImplicit': False,
'ExtentId': '01',
'Extent': 'Deliver and assess',
'StartDate': '2015-3-3',
'EndDate': '2022-3-3',
'DeliveryNsw': True,
'DeliveryVic': True,
'DeliveryQld': True,
'DeliverySa': True,
'DeliveryWa': True,
'DeliveryTas': True,
'DeliveryNt': True,
'DeliveryAct': True,
'ScopeDecisionType': 0,
'ScopeDecision': 'Deliver and assess',
'OverseasCodeAlpha': None,
'OverseasCodeAlhpaList': [],
'OverseasCodeAlphaOutput': ''}
这篇关于如何使用python从网站中提取表的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!