I am crawling Amazon product pages, and in principle it works fine.
I got three classes from this nice tutorial:
http://www.netinstructions.com/how-to-make-a-simple-web-crawler-in-java/
I added my own code to those files. Here is the Spider class:
import java.io.FileNotFoundException;
import java.util.*;

public class Spider {
    public static final int MAX_PAGES_TO_SEARCH = 10000;
    private Set<String> pagesVisited = new HashSet<String>();
    private List<String> pagesToVisit = new LinkedList<String>();

    public void search(String url) {
        while (this.pagesVisited.size() < MAX_PAGES_TO_SEARCH) {
            String currentUrl;
            SpiderLeg leg = new SpiderLeg();
            if (this.pagesToVisit.isEmpty()) {
                // First iteration: start from the seed URL.
                currentUrl = url;
                this.pagesVisited.add(url);
            } else {
                currentUrl = this.nextUrl();
            }
            try {
                Thread.sleep(10000); // wait 10 seconds between requests
                leg.crawl(currentUrl); // lots of stuff happening here, look at the crawl method in SpiderLeg
            } catch (FileNotFoundException e) {
                System.out.println("Oops, FileNotFoundException caught");
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            this.pagesToVisit.addAll(leg.getLinks());
        }
        System.out.println("\n**Done** Visited " + this.pagesVisited.size() + " web page(s)");

        SpiderLeg leg = new SpiderLeg();
        leg.calcAdjMatrix();
        for (int i = 0; i < leg.adjMatrix.length; i++) {
            System.out.println(Arrays.toString(leg.adjMatrix[i]));
        }
    }

    private String nextUrl() {
        String nextUrl;
        do {
            if (this.pagesToVisit.isEmpty()) {
                // Fall back to a fixed product page when the frontier runs dry.
                return "https://www.amazon.de/Proband-Thriller-Guido-Kniesel/dp/1535287004/ref=sr_1_1?s=books&ie=UTF8&qid=1478247246&sr=1-1&keywords=%5B%5D";
            }
            nextUrl = this.pagesToVisit.remove(0);
        } while (this.pagesVisited.contains(nextUrl));
        this.pagesVisited.add(nextUrl);
        return nextUrl;
    }
}
The SpiderLeg class:
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.util.*;

public class SpiderLeg {
    // We'll use a fake USER_AGENT so the web server thinks the robot is a normal web browser.
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36";
    private static List<String> links = new LinkedList<String>();
    private static String graphLink;
    private Document htmlDocument;
    private static double counter = 0;
    static Map<String, Set<String>> adjMap = new HashMap<String, Set<String>>();
    static int[][] adjMatrix;
    static List<String> mapping;

    // Fetches one page, extracts product links plus sales-rank and category data,
    // and appends one CSV row per "Horror" product.
    public boolean crawl(String url) throws FileNotFoundException {
        if (url.isEmpty()) {
            return false;
        }
        try {
            Connection connection = Jsoup.connect(url).ignoreContentType(true).userAgent(USER_AGENT);
            Document htmlDocument = connection.get();
            this.htmlDocument = htmlDocument;

            if (connection.response().statusCode() == 200) {
                // 200 is the HTTP OK status code, indicating that everything is great.
                counter++;
                double progress = (counter / Spider.MAX_PAGES_TO_SEARCH) * 100;
                System.out.println("\n**Visiting** Received web page at " + url);
                System.out.println("\n**Progress** " + progress + "%");
            }
            if (!connection.response().contentType().contains("text/html")) {
                System.out.println("**Failure** Retrieved something other than HTML");
                return false;
            }

            //Elements linksOnPage = htmlDocument.select("a[href*=/gp/product/]");
            Elements linksOnPage = htmlDocument.select("a[href*=/dp/]");
            Elements salesRank = htmlDocument.select("span.zg_hrsr_rank");
            Elements category = htmlDocument.select("span.zg_hrsr_ladder a");

            String categoryString = category.html().replace("\n", " ");
            String salesRankString = salesRank.html().replace("\n", " ");
            System.out.println("Found (" + linksOnPage.size() + ") links");

            PrintWriter pw = new PrintWriter(new FileWriter("Horror.csv", true));
            StringBuilder sb = new StringBuilder();

            // The "title" is the URL fragment between ".de/" and "/dp".
            int beginIndex = url.indexOf(".de/");
            int endIndex = url.indexOf("/dp");
            String title = url.substring(beginIndex + 4, endIndex);

            if (!adjMap.containsKey(title)) {
                if (categoryString.contains("Horror")) {
                    adjMap.put(title, new HashSet<String>());
                    sb.append(title).append(',');
                    sb.append(salesRankString).append(',');
                    sb.append(categoryString).append(',');
                    for (Element link : linksOnPage) {
                        String graphLink = link.attr("abs:href");
                        if (!graphLink.contains("one-click")
                                && !graphLink.contains("Kindle")
                                && !graphLink.contains("unsticky")) {
                            links.add(graphLink);
                            adjMap.get(title).add(cutTitle(graphLink));
                            sb.append(graphLink).append(',');
                        }
                    }
                    sb.append('\n');
                    pw.write(sb.toString());
                    pw.close();
                }
            }
            System.out.println("done!");
            return true;
        } catch (IOException ioe) {
            // We were not successful in our HTTP request
            System.out.println("Error in our HTTP request " + ioe);
            return false;
        }
    }

    // Builds the adjacency matrix of the product graph collected in adjMap.
    public static void calcAdjMatrix() {
        Set<String> allMyURLs = new HashSet<>(adjMap.keySet());
        for (String s : adjMap.keySet()) {
            allMyURLs.addAll(adjMap.get(s));
            System.out.println(s + "\t" + adjMap.get(s));
        }
        int dim = allMyURLs.size();
        adjMatrix = new int[dim][dim];
        List<String> nodes_list = new ArrayList<>();
        for (String s : allMyURLs) {
            nodes_list.add(s);
        }
        for (String s : nodes_list) {
            Set<String> outEdges = adjMap.get(s);
            int i = nodes_list.indexOf(s);
            if (outEdges != null) {
                for (String s1 : outEdges) {
                    int j = nodes_list.indexOf(s1);
                    adjMatrix[i][j] = 1;
                }
            }
        }
    }

    // Extracts the title fragment between ".de/" and "/dp" of an Amazon product URL.
    public String cutTitle(String url) throws FileNotFoundException {
        int beginIndex = url.indexOf(".de/");
        int endIndex = url.indexOf("/dp");
        String title;
        if (url.contains(".de") && url.contains("/dp")) {
            title = url.substring(beginIndex + 4, endIndex);
        } else {
            title = "wrong url";
        }
        return title;
    }

    public boolean searchForWord(String searchWord) {
        if (this.htmlDocument == null) {
            System.out.println("ERROR! Call crawl() before performing analysis on the document");
            return false;
        }
        System.out.println("Searching for the word " + searchWord + "...");
        String bodyText = this.htmlDocument.body().text();
        return bodyText.toLowerCase().contains(searchWord.toLowerCase());
    }

    public List<String> getLinks() {
        return links;
    }
}
The SpiderTest class:
public class SpiderTest {
    public static void main(String[] args) {
        Spider spider = new Spider();
        spider.search("https://www.amazon.de/Wille-geschehe-Psychothriller-Guido-Kniesel/dp/1537455389/ref=pd_sim_14_1?_encoding=UTF8&psc=1&refRID=CQPDDGY4BJ4D8THNNSZ6");
    }
}
The problem now is that after roughly 100 URLs (I think), Amazon bans me from the server and the program stops finding new URLs.
Does anyone know how I can fix this?
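A rough way to check whether Amazon is actively rejecting the requests (rather than returning pages that simply contain no product links) is to catch Jsoup's HttpStatusException separately, since it carries the HTTP status code. This is only a diagnostic sketch, and the URL is just an example built from one of the product IDs above:

import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;

// Diagnostic sketch only: Jsoup throws HttpStatusException (a subclass of IOException)
// when the server answers with an error status such as 503, so logging it separately
// shows whether the requests are actually being blocked.
public class BlockCheck {
    public static void main(String[] args) {
        String url = "https://www.amazon.de/dp/1535287004"; // example product URL
        try {
            Jsoup.connect(url).userAgent("Mozilla/5.0").get();
            System.out.println("Request succeeded; not blocked (yet).");
        } catch (HttpStatusException e) {
            // A 503 here usually means the crawler is being throttled and should slow down.
            System.out.println("Blocked: HTTP " + e.getStatusCode() + " for " + e.getUrl());
        } catch (Exception e) {
            System.out.println("Other error: " + e);
        }
    }
}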
Best answer
Well, don't be rude and crawl them, then.
Check their robots.txt (wiki) to see what they allow you to do. Don't be surprised if they block you when you go to places they don't want you to go.
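To make that check concrete, here is a minimal sketch that downloads https://www.amazon.de/robots.txt with Jsoup (already used in the question's code) and tests whether a path such as the /dp/ product pages is disallowed for the generic "User-agent: *" section. It is deliberately simplified and ignores Allow lines and wildcards:

import org.jsoup.Jsoup;

import java.util.ArrayList;
import java.util.List;

// Minimal sketch: fetch robots.txt and check whether a path is disallowed for
// "User-agent: *". Simplified on purpose: no Allow lines, no wildcard handling.
public class RobotsCheck {
    public static void main(String[] args) throws Exception {
        String robotsTxt = Jsoup.connect("https://www.amazon.de/robots.txt")
                .ignoreContentType(true) // robots.txt is text/plain, not HTML
                .execute()
                .body();

        List<String> disallowed = new ArrayList<>();
        boolean genericSection = false;
        for (String line : robotsTxt.split("\n")) {
            line = line.trim();
            if (line.toLowerCase().startsWith("user-agent:")) {
                genericSection = line.endsWith("*");
            } else if (genericSection && line.toLowerCase().startsWith("disallow:")) {
                disallowed.add(line.substring("disallow:".length()).trim());
            }
        }

        String path = "/dp/1535287004"; // example path taken from the question's URLs
        boolean blocked = false;
        for (String prefix : disallowed) {
            if (!prefix.isEmpty() && path.startsWith(prefix)) {
                blocked = true;
                break;
            }
        }
        System.out.println(path + (blocked ? " is disallowed" : " is not explicitly disallowed") + " for User-agent: *");
    }
}

Even for paths that robots.txt does not forbid, being polite in practice means keeping a generous delay between requests (the Thread.sleep(10000) in Spider.search already helps) and backing off further, or stopping, once the server starts answering with 503.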