本文介绍了如何使用python从网站中提取表的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!
问题描述
我一直在尝试从网站中提取表格,但毫无头绪,谁能帮帮我?我的目标是提取 Scope(范围)页面中的表格:
import requests
import json
import pandas as pd
import re

def get_organisationId(url):
    # url = 'https://training.gov.au/Organisation/Details/31102'
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36'}
    resp = requests.get(url, headers=headers)
    id_list = re.findall(r'OrganisationId=(.*?)&', resp.text)
    organisationId = id_list[0] if id_list else None
    return organisationId

# 首先获取 organisationId
url = 'https://training.gov.au/Organisation/Details/31102'
organisationId = get_organisationId(url)

def get_AjaxScopeQualification(organisationId):
    if organisationId:
        url = f'https://training.gov.au/Organisation/AjaxScopeQualification/{organisationId}?tabIndex=4'
        headers = {
            'origin': 'https://training.gov.au',
            'referer': f'https://training.gov.au/Organisation/Details/{organisationId}?tabIndex=4',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest'
        }
        data = {'page': '1', 'size': '100', 'orderBy': 'Code-asc', 'groupBy': '', 'filter': ''}
        r = requests.post(url, json=data, headers=headers)
        response = json.loads(re.sub(r'new Date\((\d+),(\d+),(\d+),0,0,0\)', r'"\1-\2-\2"', r.text))
        return response

response = get_AjaxScopeQualification(organisationId)
dfn = pd.json_normalize(response, 'data', meta=['total'])
print(dfn.columns)
print(dfn[['Code', 'Title', 'Extent']])
结果:
response['data'][0]
{'Id': '5096634d-4210-4fd4-a51d-f548cd39d57b',
 'NrtId': '2feb7e3f-7fc6-4719-ba66-2a066f6635c7',
 'RtoId': '3fbfd9c9-3cce-4d69-973e-4e2674f8c5a9',
 'TrainingComponentType': 2,
 'Code': 'BSB20115',
 'Title': 'Certificate II in Business',
 'IsImplicit': False,
 'ExtentId': '01',
 'Extent': 'Deliver and assess',
 'StartDate': '2015-3-3',
 'EndDate': '2022-3-3',
 'DeliveryNsw': True,
 'DeliveryVic': True,
 'DeliveryQld': True,
 'DeliverySa': True,
 'DeliveryWa': True,
 'DeliveryTas': True,
 'DeliveryNt': True,
 'DeliveryAct': True,
 'ScopeDecisionType': 0,
 'ScopeDecision': 'Deliver and assess',
 'OverseasCodeAlpha': None,
 'OverseasCodeAlhpaList': [],
 'OverseasCodeAlphaOutput': ''}
I have been trying to extract the table from a website but I am lost. Can anyone help me? My goal is to extract the table of the scope page: https://training.gov.au/Organisation/Details/31102
import requests
from bs4 import BeautifulSoup
# Fetch the organisation details page and try to read the scope table from
# the static HTML.
# NOTE(review): the scope table appears to be loaded by a separate XHR
# request (AjaxScopeQualification), so it is normally absent from the
# static HTML; soup.find(...) then returns None and the original code
# crashed with AttributeError on table.find_all. Guard against that and
# fall back to an empty result — the XHR-based approach below is the fix.
url = "https://training.gov.au/Organisation/Details/31102"
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')
table = soup.find(id="ScopeQualification")
if table is not None:
    rows = [row.text.split() for row in table.find_all("tr")]
else:
    rows = []  # table not present in the static HTML; use the AJAX endpoint instead
解决方案
- Find the `OrganisationId` in the HTML of 'https://training.gov.au/Organisation/Details/31102'.
- Find the XHR URL the page calls, https://training.gov.au/Organisation/AjaxScopeQualification/3fbfd9c9-3cce-4d69-973e-4e2674f8c5a9?tabIndex=4, which uses the POST method.
import requests
import json
import pandas as pd
import re
def get_organisationId(url):
    """Fetch *url* and extract the OrganisationId GUID from the page HTML.

    The details page embeds links of the form ``...OrganisationId=<guid>&...``;
    the first such id is returned, or None when the page contains none.
    """
    # url = 'https://training.gov.au/Organisation/Details/31102'
    ua = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) '
          'AppleWebKit/537.36 (KHTML, like Gecko) '
          'Chrome/87.0.4280.67 Safari/537.36')
    resp = requests.get(url, headers={'User-Agent': ua})
    matches = re.findall(r'OrganisationId=(.*?)&', resp.text)
    if not matches:
        return None
    return matches[0]
# get organisationId first
# (the details page embeds the GUID that the AJAX endpoint below requires)
url = 'https://training.gov.au/Organisation/Details/31102'
organisationId = get_organisationId(url)
def get_AjaxScopeQualification(organisationId):
    """POST to the AjaxScopeQualification XHR endpoint and return parsed JSON.

    The endpoint's response embeds JavaScript ``new Date(y,m,d,0,0,0)``
    literals, which are not valid JSON; they are rewritten to ``"y-m-d"``
    strings before parsing.  Returns None when *organisationId* is falsy
    (same as the original implicit None return).
    """
    if not organisationId:
        return None
    url = f'https://training.gov.au/Organisation/AjaxScopeQualification/{organisationId}?tabIndex=4'
    headers = {
        'origin': 'https://training.gov.au',
        'referer': f'https://training.gov.au/Organisation/Details/{organisationId}?tabIndex=4',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest'
    }
    data = {'page': '1', 'size': '100', 'orderBy': 'Code-asc', 'groupBy': '', 'filter': ''}
    r = requests.post(url, json=data, headers=headers)
    # BUG FIX: the replacement used to be r'"\1-\2-\2"', repeating the month
    # group as the day (the sample output shows month == day for every date).
    # Use group 3 for the day.
    # NOTE(review): JavaScript Date months are 0-based, so the month value
    # here may be off by one — confirm before relying on these dates.
    cleaned = re.sub(r'new Date\((\d+),(\d+),(\d+),0,0,0\)', r'"\1-\2-\3"', r.text)
    return json.loads(cleaned)
# Fetch the scope data and print the interesting columns.
# BUG FIX: get_AjaxScopeQualification returns None when no OrganisationId
# was found; the original code then crashed inside pd.json_normalize with a
# TypeError. Guard the None case explicitly.
response = get_AjaxScopeQualification(organisationId)
if response is None:
    print('Could not resolve an OrganisationId; nothing to fetch.')
else:
    # 'data' holds one record per qualification; 'total' is the record count
    dfn = pd.json_normalize(response, 'data', meta=['total'])
    print(dfn.columns)
    print(dfn[['Code', 'Title', 'Extent']])
result:
response['data'][0]
{'Id': '5096634d-4210-4fd4-a51d-f548cd39d57b',
'NrtId': '2feb7e3f-7fc6-4719-ba66-2a066f6635c7',
'RtoId': '3fbfd9c9-3cce-4d69-973e-4e2674f8c5a9',
'TrainingComponentType': 2,
'Code': 'BSB20115',
'Title': 'Certificate II in Business',
'IsImplicit': False,
'ExtentId': '01',
'Extent': 'Deliver and assess',
'StartDate': '2015-3-3',
'EndDate': '2022-3-3',
'DeliveryNsw': True,
'DeliveryVic': True,
'DeliveryQld': True,
'DeliverySa': True,
'DeliveryWa': True,
'DeliveryTas': True,
'DeliveryNt': True,
'DeliveryAct': True,
'ScopeDecisionType': 0,
'ScopeDecision': 'Deliver and assess',
'OverseasCodeAlpha': None,
'OverseasCodeAlhpaList': [],
'OverseasCodeAlphaOutput': ''}
这篇关于如何使用python从网站中提取表的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!