1.页面解析接口

package com.dajiangtai.djt_spider.service;

import com.dajiangtai.djt_spider.entity.Page;

/**
 * Page parsing interface.
 *
 * @author Administrator
 */
public interface IProcessService {

    /**
     * Parses the given page.
     *
     * @param page the page to parse
     */
    void process(Page page);
}

2.页面解析实现类

package com.dajiangtai.djt_spider.service.impl;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;

import com.dajiangtai.djt_spider.entity.Page;
import com.dajiangtai.djt_spider.service.IProcessService;
import com.dajiangtai.djt_spider.util.HtmlUtil;
import com.dajiangtai.djt_spider.util.LoadPropertyUtil;
import com.dajiangtai.djt_spider.util.RegexUtil;

/**
* 优酷页面解析实现类
* @author Administrator
*
*/
public class YOUKUProcessService implements IProcessService{

//获取到的总播放数:16,960,789,989 其xpath为:
// /html/body/div[4]/div/div[1]/div[2]/div[2]/ul/li[11]
//这里ul最近的div[2]其class为"p-base",因此,仿写其他xpath,改成如下相对路径
private String parseAllNumber = "/body/div/div/div/div/div/ul/li[11]";
//评论数
private String parseCommentNumber = "//div[@class=\"p-base\"]/ul/li[12]";
//赞数
private String parseSupportNumber = "//div[@class=\"p-base\"]/ul/li[13]";

public void process(Page page) {

String content = page.getContent();
HtmlCleaner htmlCleaner = new HtmlCleaner();
//利用htmlCleaner对网页进行解析,得到根节点
TagNode rootNode = htmlCleaner.clean(content);
try {
Object[] evaluateXPath = rootNode.evaluateXPath(parseAllNumber);
if(evaluateXPath.length>0){
TagNode node = (TagNode)evaluateXPath[0];
System.out.println(node.getText().toString());
}

evaluateXPath = rootNode.evaluateXPath(parseCommentNumber);
if(evaluateXPath.length>0){
TagNode node = (TagNode)evaluateXPath[0];
System.out.println(node.getText().toString());
}

evaluateXPath = rootNode.evaluateXPath(parseSupportNumber);
if(evaluateXPath.length>0){
TagNode node = (TagNode)evaluateXPath[0];
System.out.println(node.getText().toString());
}
} catch (XPatherException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}

}

3.爬虫入口类在main方法中对页面解析方法进行测试:

package com.dajiangtai.djt_spider.start;

import com.dajiangtai.djt_spider.entity.Page;
import com.dajiangtai.djt_spider.service.IDownLoadService;
import com.dajiangtai.djt_spider.service.IProcessService;
import com.dajiangtai.djt_spider.service.IStoreService;
import com.dajiangtai.djt_spider.service.impl.ConsoleStoreService;
import com.dajiangtai.djt_spider.service.impl.HttpClientDownLoadService;
import com.dajiangtai.djt_spider.service.impl.YOUKUProcessService;

/**
 * Entry class of the TV-series crawler.
 *
 * @author Administrator
 */
public class StartDSJCount {

    // Page download service
    private IDownLoadService downLoadService;

    // Page parsing service
    private IProcessService processService;

    /**
     * Wires an HTTP-client downloader and the Youku parser into a crawler instance, downloads
     * one hard-coded Youku URL and passes the resulting page to the parser.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        StartDSJCount crawler = new StartDSJCount();
        crawler.setDownLoadService(new HttpClientDownLoadService());
        crawler.setProcessService(new YOUKUProcessService());

        final String targetUrl =
                "http://list.youku.com/show/id_z9cd2277647d311e5b692.html?spm=a2h0j.8191423.sMain.5~5~A!2.iCUyO9";

        // Download the page first, then parse it.
        Page downloaded = crawler.downloadPage(targetUrl);
        crawler.processPage(downloaded);
    }

    /**
     * Downloads a page by delegating to the configured download service.
     *
     * @param url address of the page to download
     * @return whatever the download service returns for {@code url}
     */
    public Page downloadPage(String url) {
        return downLoadService.download(url);
    }

    /**
     * Parses a page by delegating to the configured process service.
     *
     * @param page the page to parse
     */
    public void processPage(Page page) {
        processService.process(page);
    }

    public void setDownLoadService(IDownLoadService downLoadService) {
        this.downLoadService = downLoadService;
    }

    public IDownLoadService getDownLoadService() {
        return this.downLoadService;
    }

    public void setProcessService(IProcessService processService) {
        this.processService = processService;
    }

    public IProcessService getProcessService() {
        return this.processService;
    }

}

4.测试结果如下:

[图片:通过xpath获取对应的整个子节点信息]

这样,下图中标记信息已经全部解析成功了。

[图片:通过xpath获取对应的整个子节点信息]

05-28 20:34