问题描述
我想从Roster Resource获取数据,这里是一个网页的例子()。最低限度,我希望获得预计转到首发阵容并将该数据导入到我的电子表格中。然后我会为Roster Resource的每个MLB球队做这件事,为每支球队创建一张表格,并为每支球队预测阵容。
我尝试了一些 getElementById和getElementsByClassName,但是我很难获取我想要的数据,因为这似乎只是网页上的一个非常大的表格。
任何见解如果你浏览网页然后选择检查元素在浏览器开发工具中会看到整个表格位于一个框架中:
覆盖>< iframe id =pageswitcher-contentframeborder =0marginheight =0marginwidth =0src =https://docs.google.com/spreadsheets/d/e/2PACX-1vSe6YBd7UW_ijhVHdRsM132Z3aUXUIzGuHcoqqdsr_nUXIYHbvRDFY0XCwGVndXJnWRaWVYhbeDbo5W/pubhtml/sheet?headers=false&gid=1569103012style =display:block;宽度:100%;身高:100%;>< / iframe>
所以实际上您需要检索数据如下面的代码所示:
<$ c从这个Google Spreadsheet文档中,可以用XHR和Regex完成。 $ c> Option Explicit
Sub Test()
Dim sContent As String
Dim i As Long
Dim j As Long
Dim k As Long
Dim aTables()
Dim aHeader()As String
Dim aRows()As String
通过XHR检索HTML内容
With CreateObject(MSXML2.XMLHTTP)
。打开GET,https://www.rosterresource.com/mlb-arizona-diamondbacks,假
。发送
sContent =。 ResponseText
End With
'iframe之前全部剪切URL
sContent = Split(sContent,< iframe src =,2)(1)
' ?在URL中签名
sContent = Split(sContent,?,2)(0)
'下载谷歌传播通过提取的URL
'e。 G。 https://docs.google.com/spreadsheets/d/e/2PACX-1vQngsjnOpqkD8FQIOLn4cFayZTe4dl5VJZLNjMzji2Iq0dVXan7nj20Pq6oKnVS_HFla9e5GUtCyYl_/pubhtml
'e。 G。 https://docs.google.com/spreadsheets/d/e/2PACX-1vSe6YBd7UW_ijhVHdRsM132Z3aUXUIzGuHcoqqdsr_nUXIYHbvRDFY0XCwGVndXJnWRaWVYhbeDbo5W/pubhtml
通过XHR $ b $检索HTML内容使用CreateObject(MSXML2.XMLHTTP)
.Open GET,sContent,False
.Send
sContent = .ResponseText
End With
'用RegEx $ b $解析用CreateObject(VBScript.RegExp)
.Global = True
.MultiLine = True
.IgnoreCase = True
'在iframe内处理所有表格
.Pattern =< table \ b [\ s\S] *>?(?[\s\S] *)< /表>中
With .Execute(sContent)
ReDim aTables(0 To .Count - 1)
For i = 0 To .Count - 1
aTables(i)= .Item(i ).SubMatches(0)
Next
End With
For k = 0 To UBound(aTables)
'简单的HTML简化
sContent = aTables(k)
'删除除表格格式之外的所有标签
.Pattern =<(?!/ td | / tr | / th |(?:td | tr | th)\b)[^>] * > | \r | \\\
| \ t
sContent = .Replace(sContent,)
'移除标签属性
.Pattern =<(\瓦特+)\b [^>] +>中
sContent = .Replace(sContent,< $ 1>)
'将th替换为td
.Pattern =<(/?)th>
sContent = .Replace(sContent,< $ 1td>)
'替换HTML实体和名称; &安培; #NUMBER;字符
.Pattern =&(?: \w + |#\d +);
.Global = False
使用.Execute(sContent)
如果.Count = 0则退出
sContent = Replace(sContent,.Item(0) ,DecodeHTMLEntities(.Item(0)))
End With
Loop
.Global = True
'Extract rows
.Pattern =< tr>( ?:?< TD> * LT; / TD>)+)< / TR>中
With .Execute(sContent)
ReDim aRows(0 To .Count - 1,0)
For i = 0 To .Count - 1
aRows(i,0)= .Item(i).SubMatches(0)
Next
End With
'Extract cells
.Pattern =< td>(。*?)< / td>
For i = 0 To UBound(aRows,1)
With .Execute(aRows(i,0))
For j = 0 To .Count - 1
如果UBound aRows,2) j然后ReDim保存aRows(UBound(aRows,1),j)
aRows(i,j)= Trim(.Item(j).SubMatches(0))
DoEvents
Next
End With
Next
aTables(k)= aRows
Next
End With
'输出
With ThisWorkbook
'全部删除现有工作表
Application.DisplayAlerts = False
.Sheets.Add,.Sheets(.Sheets.Count)
Do .Sheets.Count> 1
.Sheets(1).Delete
Loop
Application.DisplayAlerts = True
'输出每个表来分隔工作表
对于k = 0 To UBound(aTables)
如果.Sheets.Count< (k + 1)Then .Sheets.Add,.Sheets(.Sheets.Count)
With .Sheets(k + 1)
.Cells.Delete
Output2DArray .Cells(1,1 ),a表格(k)
.Columns.AutoFit
以
结尾下一个
以
结尾结束小组
功能DecodeHTMLEntities(sText As String)作为字符串
静态oHtmlfile作为对象
静态oDiv作为对象
如果oHtmlfile没有那么
设置oHtmlfile = CreateObject( htmlfile)
oHtmlfile.Open
Set oDiv = oHtmlfile.createElement(div)
End If
oDiv.innerHTML = sText
DecodeHTMLEntities = oDiv.innerText
End Function
Sub Output2DArray(oDstRng作为范围,aCells作为变体)
随着oDstRng
.Parent.Select
With .Resize(_
UBound(aCells,1) - LBound(aCells,1)+ 1,_
UBound(aCells,2) - LBou nd(aCells,2)+ 1)
.NumberFormat =@
.Value = aCells
End With
End With
End Sub
通常RegEx不推荐用于HTML解析,所以。在这种情况下处理的数据非常简单,这就是为什么使用RegEx进行分析的原因。关于RegEx:(特别是),,。简化使得HTML代码在某种程度上适合解析。顺便说一句,使用相同的方法。
I am trying to get data from Roster Resource, here's an example of a webpage (https://www.rosterresource.com/mlb-arizona-diamondbacks). At the very minimum, I want to get the "Projected "Go-to" Starting Lineup" and import that data into my spreadsheet. I would then do this for every MLB team from Roster Resource to create a sheet that has every team and the projected lineup for each team.
I have tried some methods of "getElementById" and "getElementsByClassName", but I'm having difficulty getting to the data I want since this seems to be just one very large table on the webpage.
Any insight to get me on the right direction of getting the data would be very helpful.
If you navigate the webpage https://www.rosterresource.com/mlb-arizona-diamondbacks and choose Inspect element from context menu on the table, you will see in browser developer tools that the whole table is located within a frame:
<iframe id="pageswitcher-content" frameborder="0" marginheight="0" marginwidth="0" src="https://docs.google.com/spreadsheets/d/e/2PACX-1vSe6YBd7UW_ijhVHdRsM132Z3aUXUIzGuHcoqqdsr_nUXIYHbvRDFY0XCwGVndXJnWRaWVYhbeDbo5W/pubhtml/sheet?headers=false&gid=1569103012" style="display: block; width: 100%; height: 100%;"></iframe>
So actually you need to retrieve the data from that Google Spreadsheet document. That could be done with XHR and Regex, as shown in the below code:
Option Explicit
Sub Test()
Dim sContent As String
Dim i As Long
Dim j As Long
Dim k As Long
Dim aTables()
Dim aHeader() As String
Dim aRows() As String
' Retrieve HTML content via XHR
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.rosterresource.com/mlb-arizona-diamondbacks", False
.Send
sContent = .ResponseText
End With
' Cut all before iframe URL
sContent = Split(sContent, "<iframe src=""", 2)(1)
' Cut all after ? sign within URL
sContent = Split(sContent, "?", 2)(0)
' Download google spreadsheet by extracted URL
' e. g. https://docs.google.com/spreadsheets/d/e/2PACX-1vQngsjnOpqkD8FQIOLn4cFayZTe4dl5VJZLNjMzji2Iq0dVXan7nj20Pq6oKnVS_HFla9e5GUtCyYl_/pubhtml
' e. g. https://docs.google.com/spreadsheets/d/e/2PACX-1vSe6YBd7UW_ijhVHdRsM132Z3aUXUIzGuHcoqqdsr_nUXIYHbvRDFY0XCwGVndXJnWRaWVYhbeDbo5W/pubhtml
' Retrieve HTML content via XHR
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", sContent, False
.Send
sContent = .ResponseText
End With
' Parse with RegEx
With CreateObject("VBScript.RegExp")
.Global = True
.MultiLine = True
.IgnoreCase = True
' Process all tables within iframe content
.Pattern = "<table\b[\s\S]*?>([\s\S]*?)</table>"
With .Execute(sContent)
ReDim aTables(0 To .Count - 1)
For i = 0 To .Count - 1
aTables(i) = .Item(i).SubMatches(0)
Next
End With
For k = 0 To UBound(aTables)
' Minor HTML simplification
sContent = aTables(k)
' Remove all tags except table formatting
.Pattern = "<(?!/td|/tr|/th|(?:td|tr|th)\b)[^>]*>|\r|\n|\t"
sContent = .Replace(sContent, "")
' Remove tags attributes
.Pattern = "<(\w+)\b[^>]+>"
sContent = .Replace(sContent, "<$1>")
' Replace th with td
.Pattern = "<(/?)th>"
sContent = .Replace(sContent, "<$1td>")
' Replace HTML entities &name; &#number; with chars
.Pattern = "&(?:\w+|#\d+);"
.Global = False
Do
With .Execute(sContent)
If .Count = 0 Then Exit Do
sContent = Replace(sContent, .Item(0), DecodeHTMLEntities(.Item(0)))
End With
Loop
.Global = True
' Extract rows
.Pattern = "<tr>((?:<td>.*?</td>)+)</tr>"
With .Execute(sContent)
ReDim aRows(0 To .Count - 1, 0)
For i = 0 To .Count - 1
aRows(i, 0) = .Item(i).SubMatches(0)
Next
End With
' Extract cells
.Pattern = "<td>(.*?)</td>"
For i = 0 To UBound(aRows, 1)
With .Execute(aRows(i, 0))
For j = 0 To .Count - 1
If UBound(aRows, 2) < j Then ReDim Preserve aRows(UBound(aRows, 1), j)
aRows(i, j) = Trim(.Item(j).SubMatches(0))
DoEvents
Next
End With
Next
aTables(k) = aRows
Next
End With
' Output
With ThisWorkbook
' Remove all existing worksheets
Application.DisplayAlerts = False
.Sheets.Add , .Sheets(.Sheets.Count)
Do While .Sheets.Count > 1
.Sheets(1).Delete
Loop
Application.DisplayAlerts = True
' Output each table to separate worksheet
For k = 0 To UBound(aTables)
If .Sheets.Count < (k + 1) Then .Sheets.Add , .Sheets(.Sheets.Count)
With .Sheets(k + 1)
.Cells.Delete
Output2DArray .Cells(1, 1), aTables(k)
.Columns.AutoFit
End With
Next
End With
End Sub
Function DecodeHTMLEntities(sText As String) As String
Static oHtmlfile As Object
Static oDiv As Object
If oHtmlfile Is Nothing Then
Set oHtmlfile = CreateObject("htmlfile")
oHtmlfile.Open
Set oDiv = oHtmlfile.createElement("div")
End If
oDiv.innerHTML = sText
DecodeHTMLEntities = oDiv.innerText
End Function
Sub Output2DArray(oDstRng As Range, aCells As Variant)
With oDstRng
.Parent.Select
With .Resize( _
UBound(aCells, 1) - LBound(aCells, 1) + 1, _
UBound(aCells, 2) - LBound(aCells, 2) + 1)
.NumberFormat = "@"
.Value = aCells
End With
End With
End Sub
Generally RegEx's aren't recommended for HTML parsing, so there is disclaimer. Data being processed in this case is quite simple that is why it is parsed with RegEx. About RegEx: introduction (especially syntax), introduction JS, VB flavor. Simplification makes HTML code suitable for parsing in some degree. BTW there is one more answer using the same approach.
这篇关于Excel VBA网页抓取数据表的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!