根据客户要求，爬取 Excel 文件中各 URL 对应文章的阅读量和评论量。代码如下：

import jxl.Cell;
import jxl.Sheet;
import jxl.Workbook;
import jxl.read.biff.BiffException;
import jxl.write.Label;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.WriteException;
import org.apache.commons.lang3.StringUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.edge.EdgeDriver;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.TimeUnit;


/**
 * Mutable holder for the data collected about a single article.
 *
 * <p>All four values are kept as raw strings and default to {@code null} until a setter (or a
 * direct field write) assigns them.
 */
class NewsInfo {

    /** Address of the article page. */
    public String url;
    /** Headline of the article. */
    public String title;
    /** Comment-count text as scraped from the page. */
    public String plNum;
    /** Read-count text for the article. */
    public String readNum;

    // ---------------------------------------------------------------- getters

    /** @return the article address, or {@code null} if not set */
    public String getUrl() {
        return this.url;
    }

    /** @return the article headline, or {@code null} if not set */
    public String getTitle() {
        return this.title;
    }

    /** @return the comment-count text, or {@code null} if not set */
    public String getPlNum() {
        return this.plNum;
    }

    /** @return the read-count text, or {@code null} if not set */
    public String getReadNum() {
        return this.readNum;
    }

    // ---------------------------------------------------------------- setters

    /** @param url the article address to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @param title the article headline to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @param plNum the comment-count text to store */
    public void setPlNum(String plNum) {
        this.plNum = plNum;
    }

    /** @param readNum the read-count text to store */
    public void setReadNum(String readNum) {
        this.readNum = readNum;
    }
}

/**
 * Batch scraper: reads article URLs from the first column of an Excel sheet, opens each one in
 * Microsoft Edge through Selenium, collects the page title and the comment-count text, and
 * writes one row per URL to a second Excel file.
 */
public class SeleniumUtils {

    /** Location of the Edge WebDriver executable. */
    private static final String EDGE_DRIVER_PATH = "E:\\edgedriver_win64\\msedgedriver.exe";

    /** Workbook whose first sheet, first column, lists the URLs to visit. */
    private static final String INPUT_XLS_PATH = "E:\\edgedriver_win64\\demo.xls";

    /** Workbook the results are written to; replaced on every run. */
    private static final String OUTPUT_XLS_PATH = "E:\\edgedriver_win64\\demo2.xls";

    /** Header row of the result sheet, in column order. */
    private static final String[] COLUMN_TITLES = {"url地址", "标题", "阅读量", "评论量"};

    /** Browser session shared by all methods of this class; {@code null} once it has been quit. */
    public static WebDriver driver;

    /**
     * Entry point: starts Edge, scrapes every URL found in the input workbook and writes the
     * result workbook.
     *
     * @param args unused
     * @throws WriteException if a cell cannot be written to the result workbook
     * @throws IOException if the result workbook cannot be created or written
     */
    public static void main(String[] args) throws WriteException, IOException {

        System.setProperty("webdriver.edge.driver", EDGE_DRIVER_PATH);
        driver = new EdgeDriver();

        try {
            List<String> urlList = readUrls(new File(INPUT_XLS_PATH));
            List<NewsInfo> newsList = new ArrayList<>(urlList.size());

            for (int i = 0; i < urlList.size(); i++) {
                String url = urlList.get(i);
                System.out.println("------------------------url-------------------------" + url );
                System.out.println("------------------------i-------------------------" + i );

                NewsInfo news;
                try {
                    news = getNewsReadNum(url);
                } catch (org.openqa.selenium.WebDriverException e) {
                    // A single page that fails to load must not discard the rows collected so
                    // far; keep a row carrying only the URL so the output still lines up.
                    System.out.println(e.getMessage());
                    news = new NewsInfo();
                    news.setUrl(url);
                }
                newsList.add(news);
            }

            writeExcel(newsList);
        } finally {
            // Guarantees the browser process is released even when scraping or writing fails.
            quitDriver();
        }
    }

    /**
     * Quits the shared browser session, then writes the collected rows to the result workbook.
     * Any existing result file is replaced. {@code null} field values are written as empty
     * cells.
     *
     * @param list rows to write, one per article, in output order
     * @throws IOException if the result file cannot be created or written
     * @throws WriteException if a cell cannot be added to the sheet
     */
    public static void  writeExcel(List<NewsInfo> list) throws IOException, WriteException {
        quitDriver();

        File file = new File(OUTPUT_XLS_PATH);
        // Only report success when the old file was really removed.
        if (file.exists() && file.delete()) {
            System.out.println("原有表格删除成功");
        }
        file.createNewFile();

        WritableWorkbook workbook = Workbook.createWorkbook(file);
        try {
            WritableSheet sheet1 = workbook.createSheet("sheet1", 0);

            // Header occupies row 0.
            for (int col = 0; col < COLUMN_TITLES.length; col++) {
                sheet1.addCell(new Label(col, 0, COLUMN_TITLES[col]));
            }

            // Data rows start at row 1, directly below the header.
            for (int i = 0; i < list.size(); i++) {
                NewsInfo info = list.get(i);
                int row = i + 1;
                sheet1.addCell(new Label(0, row, Objects.toString(info.getUrl(), "")));
                sheet1.addCell(new Label(1, row, Objects.toString(info.getTitle(), "")));
                sheet1.addCell(new Label(2, row, Objects.toString(info.getReadNum(), "")));
                sheet1.addCell(new Label(3, row, Objects.toString(info.getPlNum(), "")));
            }

            workbook.write();
        } finally {
            // Release the underlying file handle even if a cell or the write itself failed.
            workbook.close();
        }
    }

    /**
     * Loads {@code url} in the shared browser and collects what the page exposes.
     *
     * <p>The title is the document title, except for URLs containing {@code "toutiao"}, where the
     * {@code h1} inside the element with class {@code main} is preferred when present. The
     * comment-count text is looked up with a site-specific locator chosen from the URL; when the
     * site is not recognised or the element is missing, {@code plNum} stays {@code null}. This
     * method never assigns {@code readNum}.
     *
     * @param url address of the article to visit
     * @return a new {@link NewsInfo} carrying the URL plus whatever could be scraped
     */
    public static NewsInfo getNewsReadNum(String url){

        NewsInfo news = new NewsInfo();
        news.setUrl(url);

        driver.get(url);
        String title = driver.getTitle();
        System.out.println(title);
        news.setTitle(title);

        if (url.contains("toutiao")) {
            WebElement headline = findNested(By.className("main"), By.tagName("h1"));
            if (headline != null) {
                news.setTitle(headline.getText());
            }
        }

        By[] commentPath = commentCountPath(url);
        if (commentPath.length > 0) {
            WebElement commentElement = findNested(commentPath);
            if (commentElement != null) {
                String plNum = commentElement.getText();
                System.out.println("   plNum      Text: " + plNum);
                news.setPlNum(plNum);
            }
        }

        return news;
    }

    /** Reads every non-blank value of the first column of the first sheet of {@code inputFile}. */
    private static List<String> readUrls(File inputFile) {
        List<String> urls = new ArrayList<>();
        try (FileInputStream in = new FileInputStream(inputFile)) {
            Workbook workbook = Workbook.getWorkbook(in);
            try {
                Sheet sheet = workbook.getSheet(0);
                int rows = sheet.getRows();
                for (int i = 0; i < rows; i++) {
                    Cell[] cells = sheet.getRow(i);
                    if (cells.length == 0) {
                        // Row without any cell: nothing to read, and cells[0] would throw.
                        continue;
                    }
                    String url = cells[0].getContents();
                    if (StringUtils.isNotBlank(url)) {
                        urls.add(url);
                    }
                }
            } finally {
                workbook.close();
            }
        } catch (IOException | BiffException e) {
            // Best effort: report the problem and continue with the URLs read so far.
            e.printStackTrace();
        }
        return urls;
    }

    /**
     * Returns the locator chain leading to the comment-count element for the site named in
     * {@code url}, or an empty array when the site is not recognised. Sites are tested in a fixed
     * order and the first match wins.
     */
    private static By[] commentCountPath(String url) {
        if (url.contains("baidu")) {
            return new By[] {By.className("xcp-list-title")};
        }
        if (url.contains("sohu")) {
            return new By[] {By.className("c-comment-more")};
        }
        if (url.contains("toutiao")) {
            return new By[] {By.className("ttp-comment-wrapper"), By.className("title")};
        }
        if (url.contains("qq.com")) {
            return new By[] {By.className("header-number")};
        }
        if (url.contains("163.com")) {
            return new By[] {By.className("tie-actCount")};
        }
        return new By[0];
    }

    /**
     * Resolves {@code path} step by step: the first locator against the page, each following one
     * inside the element found before it. Returns {@code null} (after printing the driver's
     * message) when any step finds nothing.
     */
    private static WebElement findNested(By... path) {
        try {
            WebElement element = driver.findElement(path[0]);
            for (int i = 1; i < path.length; i++) {
                element = element.findElement(path[i]);
            }
            return element;
        } catch (org.openqa.selenium.NoSuchElementException e) {
            // Fully qualified on purpose: java.util.* also provides a NoSuchElementException.
            System.out.println(e.getMessage());
            return null;
        }
    }

    /** Quits the shared browser session if one is open; safe to call more than once. */
    private static void quitDriver() {
        if (driver != null) {
            try {
                driver.quit();
            } finally {
                driver = null;
            }
        }
    }
}

欢迎大神指点，也欢迎小伙伴们一起讨论。

05-15 03:00