根据客户要求,爬取 Excel 文件中每个 URL 所对应文章的阅读量和评论量。代码如下:
import jxl.Cell; import jxl.Sheet; import jxl.Workbook; import jxl.read.biff.BiffException; import jxl.write.Label; import jxl.write.WritableSheet; import jxl.write.WritableWorkbook; import jxl.write.WriteException; import org.apache.commons.lang3.StringUtils; import org.openqa.selenium.By; import org.openqa.selenium.JavascriptExecutor; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.openqa.selenium.edge.EdgeDriver; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.util.*; import java.util.concurrent.TimeUnit; class NewsInfo{ public String url; public String title; public String plNum; public String readNum; public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getPlNum() { return plNum; } public void setPlNum(String plNum) { this.plNum = plNum; } public String getReadNum() { return readNum; } public void setReadNum(String readNum) { this.readNum = readNum; } } public class SeleniumUtils { public static WebDriver driver; public static void main(String[] args) throws WriteException, IOException { System.setProperty("webdriver.edge.driver", "E:\\edgedriver_win64\\msedgedriver.exe"); driver = new EdgeDriver(); Workbook workbook = null; File Inputfile = new File("E:\\edgedriver_win64\\demo.xls"); ArrayList<String> urlList = new ArrayList<>(); List<NewsInfo> newsList = new ArrayList<>(); try { FileInputStream fileInputStream = new FileInputStream(Inputfile); workbook = Workbook.getWorkbook(fileInputStream); Sheet readfirst = workbook.getSheet(0); int rows = readfirst.getRows(); for(int i =0;i<rows;i++) { Cell[] cells = readfirst.getRow(i); //循环得到每一行的单元格对象 //根据每一个单元格对象的到里面的值 String url= cells[0].getContents(); if(StringUtils.isNotBlank(url)){ urlList.add(url); } } } catch (FileNotFoundException e) { 
e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (BiffException e) { e.printStackTrace(); } for(int i=0;i<urlList.size();i++){ String url = urlList.get(i); System.out.println("------------------------url-------------------------" + url ); System.out.println("------------------------i-------------------------" + i ); NewsInfo news = getNewsReadNum(url); newsList.add(news); } writeExcel(newsList); } public static void writeExcel(List<NewsInfo> list) throws IOException, WriteException { driver.quit(); String[] titles=new String[]{"url地址","标题","阅读量","评论量"}; File file = new File("E:\\edgedriver_win64\\demo2.xls"); //判断file是否存在 if(file.exists()){ file.delete(); System.out.println("原有表格删除成功"); } file.createNewFile(); //创建工作簿 WritableWorkbook workbook = Workbook.createWorkbook(file); WritableSheet sheet1 = workbook.createSheet("sheet1", 0); Label label=null; //设置列名 for (int j = 0; j <titles.length ; j++) { label=new Label(j,0,titles[j]); sheet1.addCell(label); } //方便定义行 int count=0; //将list中的数据添加至工作簿中 String[] title for (int i=0;i<list.size();i++) { NewsInfo info = list.get(i); String url = info.getUrl(); String title = info.getTitle(); String readNum = info.getReadNum(); String plNum = info.getPlNum(); count++; label=new Label(0,count,url); sheet1.addCell(label); label=new Label(1,count,title); sheet1.addCell(label); label=new Label(2,count,readNum); sheet1.addCell(label); label=new Label(3,count,plNum); sheet1.addCell(label); } workbook.write(); workbook.close(); } public static NewsInfo getNewsReadNum(String url){ NewsInfo news = new NewsInfo(); news.setUrl(url); try{ String readNum =""; driver.get(url); String title = driver.getTitle(); System.out.println(title); news.setTitle(title); if(url.contains("toutiao")){ try{ WebElement titleElement = driver.findElement(By.className("main")); WebElement h1= titleElement.findElement(By.tagName("h1")); title = h1.getText(); news.setTitle(title); }catch(org.openqa.selenium.NoSuchElementException e){ 
System.out.println(e.getMessage()); } } if(url.contains("baidu")){ try{ WebElement sampleDiv = driver.findElement(By.className("xcp-list-title")); String plNum = sampleDiv.getText(); System.out.println(" plNum Text: " + plNum); news.setPlNum(plNum); }catch(org.openqa.selenium.NoSuchElementException e){ System.out.println(e.getMessage()); } }else if(url.contains("sohu")){ try{ WebElement sampleDiv = driver.findElement(By.className("c-comment-more")); String plNum = sampleDiv.getText(); System.out.println(" plNum Text: " + plNum); news.setPlNum(plNum); }catch(org.openqa.selenium.NoSuchElementException e){ System.out.println(e.getMessage()); } }else if(url.contains("toutiao")){ try{ WebElement sampleDiv = driver.findElement(By.className("ttp-comment-wrapper")); WebElement sample = sampleDiv.findElement(By.className("title")); String plNum = sample.getText(); System.out.println(" plNum Text: " + plNum); news.setPlNum(plNum); }catch(org.openqa.selenium.NoSuchElementException e){ System.out.println(e.getMessage()); } }else if(url.contains("qq.com")){ try{ WebElement sampleDiv = driver.findElement(By.className("header-number")); String plNum = sampleDiv.getText(); System.out.println(" plNum Text: " + plNum); news.setPlNum(plNum); }catch(org.openqa.selenium.NoSuchElementException e){ System.out.println(e.getMessage()); } }else if(url.contains("163.com")){ try{ WebElement sampleDiv = driver.findElement(By.className("tie-actCount")); String plNum = sampleDiv.getText(); System.out.println(" plNum Text: " + plNum); news.setPlNum(plNum); }catch(org.openqa.selenium.NoSuchElementException e){ System.out.println(e.getMessage()); } } return news; }catch(org.openqa.selenium.NoSuchElementException e){ System.out.println(e.getMessage()); return news; } } }
欢迎大神指点,也欢迎小伙伴们一起讨论。