我有一个应用程序,需要解析 HTML 并从中提取某些标签。
我需要获取所有 <tr> 和 <td> 标签,并取出它们内部的文本。
能否给我一段实现这个功能的代码?
我已经在这个问题上花了好几个小时了……
该网页的内容如下:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!-- Updated: 03/11/2011 15:17:29-->
<html xmlns="http://www.w3.org/1999/xhtml" >
<head><title>
Untitled Page
</title><meta http-equiv="Page-Exit" content="progid:DXImageTransform.Microsoft.GradientWipe(duration=1)" /><meta HTTP-EQUIV="CACHE-CONTROL" content="NO-CACHE" /><meta HTTP-EQUIV="PRAGMA" content="NO-CACHE" /><meta http-equiv="refresh" content="60" />
<style type="text/css">
.DisplayTable { width: 97%; }
.DisplayHeader { font-family: Arial; font-weight: bold; font-size: 25px; color: Black; text-align: center; }
.DisplayCell { font-family: Arial; font-weight: bold; font-size: 16px; color: Black; }
.MessageTable { width: 97%; }
.MessageHeader { font-family: Arial; font-size: 20px; color: SteelBlue; border-bottom: solid 3px SteelBlue; }
.MessageText { font-family: Arial; font-size: 20px; color: SteelBlue; text-align: right; }
.DisplayFillChange { font-family: Arial; font-weight: bold; font-size: 16px; color: MediumBlue; background-color: LightCyan; border-bottom: solid 1px LightCyan; }
.DisplayFreeChange { font-family: Arial; font-weight: bold; font-size: 16px; color: OrangeRed; background-color: LightCyan; border-bottom: solid 1px LightCyan; }
.DisplayEventChange { font-family: Arial; font-weight: bold; font-size: 16px; color: DarkGreen; background-color: LightCyan; border-bottom: solid 1px LightCyan; }
.DisplayExamChange { font-family: Arial; font-weight: bold; font-size: 16px; color: IndianRed; background-color: LightCyan; border-bottom: solid 1px LightCyan; }
</style>
</head>
<body dir="rtl" style="margin: 0px; background-color: LightCyan; overflow: hidden;" scroll="no" onload="resize()">
<form name="form1" method="post" action="MainScreen.aspx?pid=17&mid=6264&page=5&msgof=0&static=1" id="form1">
<div>
<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="/wEPDwUJLTQwMjA0MzQzZGSqqj0xDnBRKxIgowwhNZzzyzQHVg==" />
</div>
<table width="100%" cellspacing="0" cellpadding="0" border="0" style="background-image: url(fill.gif);">
<tr height="59" style="font-family: Arial; font-size: 34px; color: Yellow; vertical-align: middle;">
<td width="15"> </td>
<td width="45%" align="right" id="clock">00:00</td>
<td align="center" nowrap><b>שינוי מערכת שעות לתאריך </b></td>
<td width="45%" align="left">04.11.2011</td>
<td width="15"> </td>
</tr>
</table>
<br />
<div id="header" align="center"><table width='100%' class='DisplayTable' cellspacing='0' border='1'><tr class='DisplayHeader'><td width='1%' style='color: LightCyan;'>0</td><td width='14%'>יא - 1</td><td width='14%'>יא - 2</td><td width='14%'>יא - 3</td><td width='14%'>יא - 4</td><td width='14%'>יא - 5</td><td width='14%'>יא - 6</td><td width='14%'>יא - 7</td><td width='1%' style='color: LightCyan;'>0</td></tr></table></div>
<div id="scrollPanel" align="center" style="overflow: hidden;">
<div id="panel" align="center" style=""><table width='100%' class='DisplayTable' cellspacing='0' border='1'><tr><td width='1%' class='DisplayCell'>0</td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='1%' class='DisplayCell'>0</td></tr><tr><td width='1%' class='DisplayCell'>1</td><td width='14%' class='DisplayCell'><table width='100%'></table></td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='1%' class='DisplayCell'>1</td></tr><tr><td width='1%' class='DisplayCell'>2</td><td width='14%' class='DisplayCell'><table width='100%'></table></td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='1%' class='DisplayCell'>2</td></tr><tr><td width='1%' class='DisplayCell'>3</td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='1%' class='DisplayCell'>3</td></tr><tr><td width='1%' class='DisplayCell'>4</td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> 
</td><td width='14%' class='DisplayCell'> </td><td width='1%' class='DisplayCell'>4</td></tr><tr><td width='1%' class='DisplayCell'>5</td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='1%' class='DisplayCell'>5</td></tr><tr><td width='1%' class='DisplayCell'>6</td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='1%' class='DisplayCell'>6</td></tr><tr><td width='1%' class='DisplayCell'>7</td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='1%' class='DisplayCell'>7</td></tr><tr><td width='1%' class='DisplayCell'>8</td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='1%' class='DisplayCell'>8</td></tr><tr><td width='1%' class='DisplayCell'>9</td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='14%' class='DisplayCell'> </td><td width='1%' 
class='DisplayCell'>9</td></tr></table></div>
<div id="messages" align="center"><table width='100%' class='MessageTable' cellspacing='0' cellpadding='7' border='0'><tr><td class='MessageHeader'>הודעות</td></tr></tr></table></div>
</div>
</form>
<script>
// Scroll panel element; assigned in resize() and advanced by doScroll().
var sp;
// Recomputed in resize() from how far the panel content overflows; not read elsewhere in the visible script.
var delay = 0;
function resize(){
    // Runs from <body onload>: sizes the scroll panel to the viewport space below
    // its top edge, recomputes `delay` from the content overflow, then schedules
    // the scrolling loop and the next-page hook.
    sp = document.getElementById('scrollPanel');
    // CSS lengths need a unit; a bare number is discarded outside quirks mode,
    // which would leave the panel unsized.
    sp.style.height = (document.documentElement.clientHeight - sp.offsetTop) + 'px';
    // Positive when the content panel is taller than the visible scroll panel.
    delay = document.getElementById('panel').clientHeight - sp.clientHeight;
    if (delay > 0)
        delay = delay / 5 * 120;
    else
        delay = 0;
    // Function callbacks instead of code strings: no implicit eval, and the
    // target is still looked up by name when the timer fires.
    setTimeout(function () { doScroll(); }, 3000);
    setTimeout(function () { doNextPage(); }, 500);
}
function doScroll()
{
    // Move the scroll panel down by 5 and re-arm the timer so the step repeats every 100 ms.
    var nextTop = sp.scrollTop + 5;
    sp.scrollTop = nextTop;
    setTimeout("doScroll()", 100);
}
// Paint the clock as soon as the script runs; updateClock() reschedules itself afterwards.
updateClock();
function nextUrl()
{
    // Hard-coded relative URL; its query string selects page=6.
    var target = 'MainScreen.aspx?pid=17&mid=6264&page=6&msgof=0&nd=0';
    return target;
}
function doNextPage()
{
// Empty body: nothing happens when resize() invokes this 500 ms after load.
}
function updateClock()
{
    // Write the current HH:MM string into the #clock cell, then schedule the
    // next refresh in 55 seconds.
    var clockCell = document.getElementById('clock');
    clockCell.innerHTML = getClock();
    setTimeout("updateClock()", 55000);
}
function getClock()
{
    // Returns the current local time as "HH:MM", each field zero-padded to two digits.
    var now = new Date();
    var twoDigits = function (value) {
        return value < 10 ? '0' + value : '' + value;
    };
    return twoDigits(now.getHours()) + ':' + twoDigits(now.getMinutes());
}
</script>
</body>
</html>
最佳答案
最简单的方法是使用 HTML 解析库,例如 HtmlCleaner、TagSoup、HTML Parser 等。这样就可以直接从文档中取出所需的元素,或者使用“节点访问者”(各个库的叫法可能不同)手动遍历文档。
我从上面随便选了一个库并快速浏览了它的文档(documentation);对于 HtmlCleaner,类似下面的代码应该可行:
// Parse the page into a cleaned-up DOM tree; pass the page's HTML source to clean().
HtmlCleaner cleaner = new HtmlCleaner();
TagNode root = cleaner.clean(...);
// getElementsByName requires a second boolean argument; true searches the
// whole subtree instead of only the direct children of root.
TagNode[] trNodes = root.getElementsByName("tr", true);
for (TagNode trNode : trNodes) {
    System.out.println("All text inside this <tr> tag (including children): " + trNode.getText());
}
下面是使用同一个库的另一个示例,这次通过 TagNodeVisitor 遍历,并只筛选 <td> 标签:
node.traverse(new TagNodeVisitor() {
public boolean visit(TagNode tagNode, HtmlNode htmlNode) {
    // Only element nodes can be <td>; any other kind of node is skipped.
    boolean isTableCell = htmlNode instanceof TagNode
            && "td".equals(((TagNode) htmlNode).getName());
    if (isTableCell) {
        TagNode cell = (TagNode) htmlNode;
        System.out.println("All text inside this <td> tag (including children): " + cell.getText());
    }
    // Returning true tells the visitor to keep traversing the DOM tree.
    return true;
}
});