I am crawling Amazon product pages, and in principle it works fine.
I got three classes from this nice tutorial:
http://www.netinstructions.com/how-to-make-a-simple-web-crawler-in-java/
I added my own code to those files. Here is the Spider class:
import java.io.FileNotFoundException;
import java.util.*;

public class Spider {
    public static final int MAX_PAGES_TO_SEARCH = 10000;
    private Set<String> pagesVisited = new HashSet<String>();
    private List<String> pagesToVisit = new LinkedList<String>();

    public void search(String url) {
        while (this.pagesVisited.size() < MAX_PAGES_TO_SEARCH) {
            String currentUrl;
            SpiderLeg leg = new SpiderLeg();
            if (this.pagesToVisit.isEmpty()) {
                // First iteration: start from the seed URL.
                currentUrl = url;
                this.pagesVisited.add(url);
            } else {
                currentUrl = this.nextUrl();
            }
            try {
                Thread.sleep(10000); // wait 10 seconds between requests
                leg.crawl(currentUrl); // lots of stuff happening here, look at the crawl method in SpiderLeg
            } catch (FileNotFoundException e) {
                System.out.println("Oops, FileNotFoundException caught");
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            this.pagesToVisit.addAll(leg.getLinks());
        }
        System.out.println("\n**Done** Visited " + this.pagesVisited.size() + " web page(s)");

        SpiderLeg leg = new SpiderLeg();
        leg.calcAdjMatrix();
        for (int i = 0; i < leg.adjMatrix.length; i++) {
            System.out.println(Arrays.toString(leg.adjMatrix[i]));
        }
    }

    private String nextUrl() {
        String nextUrl;
        do {
            if (this.pagesToVisit.isEmpty()) {
                // Fall back to a fixed product page when the frontier runs dry.
                return "https://www.amazon.de/Proband-Thriller-Guido-Kniesel/dp/1535287004/ref=sr_1_1?s=books&ie=UTF8&qid=1478247246&sr=1-1&keywords=%5B%5D";
            }
            nextUrl = this.pagesToVisit.remove(0);
        } while (this.pagesVisited.contains(nextUrl));
        this.pagesVisited.add(nextUrl);
        return nextUrl;
    }
}
The SpiderLeg class:
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.util.*;

public class SpiderLeg {
    // We'll use a fake USER_AGENT so the web server thinks the robot is a normal web browser.
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36";
    private static List<String> links = new LinkedList<String>();
    private static String graphLink;
    private Document htmlDocument;
    private static double counter = 0;
    static Map<String, Set<String>> adjMap = new HashMap<String, Set<String>>();
    static int[][] adjMatrix;
    static List<String> mapping;

    // Fetches one page, extracts product links plus sales-rank and category data,
    // and appends one CSV row per "Horror" product.
    public boolean crawl(String url) throws FileNotFoundException {
        if (url.isEmpty()) {
            return false;
        }
        try {
            Connection connection = Jsoup.connect(url).ignoreContentType(true).userAgent(USER_AGENT);
            Document htmlDocument = connection.get();
            this.htmlDocument = htmlDocument;

            if (connection.response().statusCode() == 200) {
                // 200 is the HTTP OK status code, indicating that everything is great.
                counter++;
                double progress = (counter / Spider.MAX_PAGES_TO_SEARCH) * 100;
                System.out.println("\n**Visiting** Received web page at " + url);
                System.out.println("\n**Progress** " + progress + "%");
            }
            if (!connection.response().contentType().contains("text/html")) {
                System.out.println("**Failure** Retrieved something other than HTML");
                return false;
            }

            //Elements linksOnPage = htmlDocument.select("a[href*=/gp/product/]");
            Elements linksOnPage = htmlDocument.select("a[href*=/dp/]");
            Elements salesRank = htmlDocument.select("span.zg_hrsr_rank");
            Elements category = htmlDocument.select("span.zg_hrsr_ladder a");

            String categoryString = category.html().replace("\n", " ");
            String salesRankString = salesRank.html().replace("\n", " ");
            System.out.println("Found (" + linksOnPage.size() + ") links");

            PrintWriter pw = new PrintWriter(new FileWriter("Horror.csv", true));
            StringBuilder sb = new StringBuilder();

            // The "title" is the URL fragment between ".de/" and "/dp".
            int beginIndex = url.indexOf(".de/");
            int endIndex = url.indexOf("/dp");
            String title = url.substring(beginIndex + 4, endIndex);

            if (!adjMap.containsKey(title)) {
                if (categoryString.contains("Horror")) {
                    adjMap.put(title, new HashSet<String>());
                    sb.append(title).append(',');
                    sb.append(salesRankString).append(',');
                    sb.append(categoryString).append(',');
                    for (Element link : linksOnPage) {
                        String graphLink = link.attr("abs:href");
                        if (!graphLink.contains("one-click")
                                && !graphLink.contains("Kindle")
                                && !graphLink.contains("unsticky")) {
                            links.add(graphLink);
                            adjMap.get(title).add(cutTitle(graphLink));
                            sb.append(graphLink).append(',');
                        }
                    }
                    sb.append('\n');
                    pw.write(sb.toString());
                    pw.close();
                }
            }
            System.out.println("done!");
            return true;
        } catch (IOException ioe) {
            // We were not successful in our HTTP request
            System.out.println("Error in our HTTP request " + ioe);
            return false;
        }
    }

    // Builds the adjacency matrix of the product graph collected in adjMap.
    public static void calcAdjMatrix() {
        Set<String> allMyURLs = new HashSet<>(adjMap.keySet());
        for (String s : adjMap.keySet()) {
            allMyURLs.addAll(adjMap.get(s));
            System.out.println(s + "\t" + adjMap.get(s));
        }
        int dim = allMyURLs.size();
        adjMatrix = new int[dim][dim];
        List<String> nodes_list = new ArrayList<>();
        for (String s : allMyURLs) {
            nodes_list.add(s);
        }
        for (String s : nodes_list) {
            Set<String> outEdges = adjMap.get(s);
            int i = nodes_list.indexOf(s);
            if (outEdges != null) {
                for (String s1 : outEdges) {
                    int j = nodes_list.indexOf(s1);
                    adjMatrix[i][j] = 1;
                }
            }
        }
    }

    // Extracts the title fragment between ".de/" and "/dp" of an Amazon product URL.
    public String cutTitle(String url) throws FileNotFoundException {
        int beginIndex = url.indexOf(".de/");
        int endIndex = url.indexOf("/dp");
        String title;
        if (url.contains(".de") && url.contains("/dp")) {
            title = url.substring(beginIndex + 4, endIndex);
        } else {
            title = "wrong url";
        }
        return title;
    }

    public boolean searchForWord(String searchWord) {
        if (this.htmlDocument == null) {
            System.out.println("ERROR! Call crawl() before performing analysis on the document");
            return false;
        }
        System.out.println("Searching for the word " + searchWord + "...");
        String bodyText = this.htmlDocument.body().text();
        return bodyText.toLowerCase().contains(searchWord.toLowerCase());
    }

    public List<String> getLinks() {
        return links;
    }
}
The SpiderTest class:
public class SpiderTest {
    public static void main(String[] args) {
        Spider spider = new Spider();
        spider.search("https://www.amazon.de/Wille-geschehe-Psychothriller-Guido-Kniesel/dp/1537455389/ref=pd_sim_14_1?_encoding=UTF8&psc=1&refRID=CQPDDGY4BJ4D8THNNSZ6");
    }
}
The problem now is that after roughly 100 URLs (I think), Amazon bans me from the server and the program stops finding new URLs.
Does anyone know how I can fix this?
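A rough way to check whether Amazon is actively rejecting the requests (rather than returning pages that simply contain no product links) is to catch Jsoup's HttpStatusException separately, since it carries the HTTP status code. This is only a diagnostic sketch, and the URL is just an example built from one of the product IDs above:

import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;

// Diagnostic sketch only: Jsoup throws HttpStatusException (a subclass of IOException)
// when the server answers with an error status such as 503, so logging it separately
// shows whether the requests are actually being blocked.
public class BlockCheck {
    public static void main(String[] args) {
        String url = "https://www.amazon.de/dp/1535287004"; // example product URL
        try {
            Jsoup.connect(url).userAgent("Mozilla/5.0").get();
            System.out.println("Request succeeded; not blocked (yet).");
        } catch (HttpStatusException e) {
            // A 503 here usually means the crawler is being throttled and should slow down.
            System.out.println("Blocked: HTTP " + e.getStatusCode() + " for " + e.getUrl());
        } catch (Exception e) {
            System.out.println("Other error: " + e);
        }
    }
}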
Best answer
Well, don't be rude and crawl them, then.
Check their robots.txt (wiki) to see what they allow you to do. Don't be surprised if they block you when you go to places they don't want you to go.
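To make that check concrete, here is a minimal sketch that downloads https://www.amazon.de/robots.txt with Jsoup (already used in the question's code) and tests whether a path such as the /dp/ product pages is disallowed for the generic "User-agent: *" section. It is deliberately simplified and ignores Allow lines and wildcards:

import org.jsoup.Jsoup;

import java.util.ArrayList;
import java.util.List;

// Minimal sketch: fetch robots.txt and check whether a path is disallowed for
// "User-agent: *". Simplified on purpose: no Allow lines, no wildcard handling.
public class RobotsCheck {
    public static void main(String[] args) throws Exception {
        String robotsTxt = Jsoup.connect("https://www.amazon.de/robots.txt")
                .ignoreContentType(true) // robots.txt is text/plain, not HTML
                .execute()
                .body();

        List<String> disallowed = new ArrayList<>();
        boolean genericSection = false;
        for (String line : robotsTxt.split("\n")) {
            line = line.trim();
            if (line.toLowerCase().startsWith("user-agent:")) {
                genericSection = line.endsWith("*");
            } else if (genericSection && line.toLowerCase().startsWith("disallow:")) {
                disallowed.add(line.substring("disallow:".length()).trim());
            }
        }

        String path = "/dp/1535287004"; // example path taken from the question's URLs
        boolean blocked = false;
        for (String prefix : disallowed) {
            if (!prefix.isEmpty() && path.startsWith(prefix)) {
                blocked = true;
                break;
            }
        }
        System.out.println(path + (blocked ? " is disallowed" : " is not explicitly disallowed") + " for User-agent: *");
    }
}

Even for paths that robots.txt does not forbid, being polite in practice means keeping a generous delay between requests (the Thread.sleep(10000) in Spider.search already helps) and backing off further, or stopping, once the server starts answering with 503.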