Crawling amazon.com

I'm crawling Amazon products and in principle it's going fine.

I have three classes from this nice tutorial:

http://www.netinstructions.com/how-to-make-a-simple-web-crawler-in-java/

I adapted the three classes into the following code (class Spider):

import java.io.FileNotFoundException;
import java.util.*;


public class Spider {
    public static final int MAX_PAGES_TO_SEARCH = 10000;
    private Set<String> pagesVisited = new HashSet<String>();
    private List<String> pagesToVisit = new LinkedList<String>();

    public void search(String url) {
        while (this.pagesVisited.size() < MAX_PAGES_TO_SEARCH) {
            String currentUrl;
            SpiderLeg leg = new SpiderLeg();
            if (this.pagesToVisit.isEmpty()) {
                //System.out.println("abc");
                currentUrl = url;
                this.pagesVisited.add(url);
            } else {
                //System.out.println("def");
                currentUrl = this.nextUrl();
            }
            try {
                Thread.sleep(10000);
                leg.crawl(currentUrl); // Lots of stuff happening here. Look at the crawl method in SpiderLeg.
            } catch (FileNotFoundException e) {
                System.out.println("Oops, FileNotFoundException caught");
            } catch (InterruptedException e) {
                e.printStackTrace();
            }

            this.pagesToVisit.addAll(leg.getLinks());
            //System.out.println("Test");
        }
        System.out.println("\n**Done** Visited " + this.pagesVisited.size() + " web page(s)");
        SpiderLeg leg = new SpiderLeg();
        leg.calcAdjMatrix();
        for (int i = 0; i < leg.adjMatrix.length; i++) {
            System.out.println(Arrays.toString(leg.adjMatrix[i]));
        }
    }

    private String nextUrl() {
        String nextUrl;
        do {
            if (this.pagesToVisit.isEmpty()) {
                return "https://www.amazon.de/Proband-Thriller-Guido-Kniesel/dp/1535287004/ref=sr_1_1?s=books&ie=UTF8&qid=1478247246&sr=1-1&keywords=%5B%5D";
            }
            nextUrl = this.pagesToVisit.remove(0);
        } while (this.pagesVisited.contains(nextUrl));
        this.pagesVisited.add(nextUrl);
        return nextUrl;
    }
}

class SpiderLeg:

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.util.*;

public class SpiderLeg {
// We'll use a fake USER_AGENT so the web server thinks the robot is a normal web browser.
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36";
    private static List<String> links = new LinkedList<String>();
    private static String graphLink;
    private Document htmlDocument;
    private static double counter = 0;
    static Map<String, Set<String>> adjMap = new HashMap<String, Set<String>>();
    static int[][] adjMatrix;
    static List<String> mapping;

    public boolean crawl(String url) throws FileNotFoundException {
        if (url.isEmpty()) {
            return false;
        }
        try {
            Connection connection = Jsoup.connect(url).ignoreContentType(true).userAgent(USER_AGENT);
            Document htmlDocument = connection.get();
            this.htmlDocument = htmlDocument;
            if (connection.response().statusCode() == 200) {
                // 200 is the HTTP OK status code
                // indicating that everything is great.
                counter++;
                double progress = (counter / Spider.MAX_PAGES_TO_SEARCH) * 100;
                System.out.println("\n**Visiting** Received web page at " + url);
                System.out.println("\n**Progress** " + progress + "%");
            }
            if (!connection.response().contentType().contains("text/html")) {
                System.out.println("**Failure** Retrieved something other than HTML");
                return false;
            }

            //Elements linksOnPage = htmlDocument.select("a[href*=/gp/product/]");
            Elements linksOnPage = htmlDocument.select("a[href*=/dp/]");
            Elements salesRank = htmlDocument.select("span.zg_hrsr_rank");
            Elements category = htmlDocument.select("span.zg_hrsr_ladder a");

            String categoryString = category.html().replace("\n", " ");
            String salesRankString = salesRank.html().replace("\n", " ");
            //System.out.println(categoryString);
            System.out.println("Found (" + linksOnPage.size() + ") links");

            int beginIndex = url.indexOf(".de/");
            int endIndex = url.indexOf("/dp");
            String title = url.substring(beginIndex + 4, endIndex);

            if (!adjMap.containsKey(title) && categoryString.contains("Horror")) {
                adjMap.put(title, new HashSet<String>());
                StringBuilder sb = new StringBuilder();
                sb.append(title).append(',');
                sb.append(salesRankString).append(',');
                sb.append(categoryString).append(',');
                for (Element link : linksOnPage) {
                    String graphLink = link.attr("abs:href");
                    if (!graphLink.contains("one-click")
                            && !graphLink.contains("Kindle")
                            && !graphLink.contains("unsticky")) {
                        links.add(graphLink);
                        //adjMap.get(url).add(graphLink);
                        adjMap.get(title).add(cutTitle(graphLink));
                        sb.append(graphLink).append(',');
                    }
                }
                sb.append('\n');
                // Open the writer only when there is something to write, and close it
                // immediately so the file handle is not leaked on pages that don't match.
                PrintWriter pw = new PrintWriter(new FileWriter("Horror.csv", true));
                pw.write(sb.toString());
                pw.close();
            }

            System.out.println("done!");
            return true;
        } catch (IOException ioe) {
            // We were not successful in our HTTP request
            System.out.println("Error in our HTTP request " + ioe);
            return false;
        }
    }

    public static void calcAdjMatrix() {
        Set<String> allMyURLs = new HashSet<>(adjMap.keySet());
        for (String s : adjMap.keySet()) {
            allMyURLs.addAll(adjMap.get(s));
            System.out.println(s + "\t" + adjMap.get(s));
        }

        int dim = allMyURLs.size();
        adjMatrix = new int[dim][dim];
        List<String> nodes_list = new ArrayList<>(allMyURLs);

        for (String s : nodes_list) {
            Set<String> outEdges = adjMap.get(s);
            int i = nodes_list.indexOf(s);
            if (outEdges != null) {
                for (String s1 : outEdges) {
                    int j = nodes_list.indexOf(s1);
                    adjMatrix[i][j] = 1;
                }
            }
        }
    }

    public String cutTitle(String url) throws FileNotFoundException {
        int beginIndex = url.indexOf(".de/");
        int endIndex = url.indexOf("/dp");
        String title;
        if (url.contains(".de") && url.contains("/dp")) {
            title = url.substring(beginIndex + 4, endIndex);
        } else {
            title = "wrong url";
        }
        return title;
    }

    public boolean searchForWord(String searchWord) {
        if (this.htmlDocument == null) {
            System.out.println("ERROR! Call crawl() before performing analysis on the document");
            return false;
        }
        System.out.println("Searching for the word " + searchWord + "...");
        String bodyText = this.htmlDocument.body().text();
        return bodyText.toLowerCase().contains(searchWord.toLowerCase());
    }

    public List<String> getLinks() {
        return links;
    }

}

class SpiderTest:

public class SpiderTest {
    public static void main(String[] args) {
        Spider spider = new Spider();
        spider.search("https://www.amazon.de/Wille-geschehe-Psychothriller-Guido-Kniesel/dp/1537455389/ref=pd_sim_14_1?_encoding=UTF8&psc=1&refRID=CQPDDGY4BJ4D8THNNSZ6");
    }
}

Now the problem is that after roughly 100 URLs, I think Amazon is banning me from the server. The program doesn't find any URLs anymore.

Does anyone have an idea how I can fix that?


Well, don't be rude and crawl them then.

Check their robots.txt (wiki) to see what they allow you to do. Don't be surprised if they ban you if you go places they don't want you to go.
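As an illustration, here is a minimal sketch (using Jsoup, which the question's code already uses) of fetching robots.txt and checking a path against its Disallow rules. This is a simplified check under assumptions: a real crawler should use a proper robots.txt parser, honor the per-user-agent sections and crawl-delay hints, and the amazon.de host and product path below are just example values.

import org.jsoup.Jsoup;

import java.io.IOException;

public class RobotsCheck {

    // Simplified check: fetch robots.txt and see whether any Disallow rule
    // matches the path we intend to crawl.
    public static boolean isProbablyAllowed(String host, String path) throws IOException {
        String robots = Jsoup.connect(host + "/robots.txt")
                .ignoreContentType(true)
                .execute()
                .body();
        for (String line : robots.split("\n")) {
            line = line.trim();
            if (line.startsWith("Disallow:")) {
                String rule = line.substring("Disallow:".length()).trim();
                if (!rule.isEmpty() && path.startsWith(rule)) {
                    return false;
                }
            }
        }
        return true;
    }

    public static void main(String[] args) throws IOException {
        // Example check against the product path format used in the question.
        System.out.println(isProbablyAllowed("https://www.amazon.de", "/dp/1535287004"));
    }
}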



This problem is very common when you try to crawl big websites that don't want to be crawled. They basically block you for a period of time to prevent their data from being crawled or stolen.

That being said, you have two options: either make each request from a different IP/server, which makes your requests look legitimate and avoids the ban, or go for the easier way, which is to use a service that does that for you.

I've done both. The first one is complex, takes time and needs maintenance (you have to build a network of servers). The second option is usually not free, but it is very fast to implement and guarantees that all your requests will return data and that you won't be banned.

There are some services on the internet that do that. I've used proxycrawl (which also has a free tier) in the past, and it works very well. They have an API that you can call, and you can keep using your same code, just changing the URL you call.

This would be an example for amazon:

GET https://api.proxycrawl.com?token=yourtoken&url=https://amazon.com

And you would always get a response. Even if you crawl 1000 pages a second, you will never be banned, as you will be calling that proxy instead, which does all the magic for you.
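For example, here is a minimal sketch of how the existing Jsoup code could call such an API instead of hitting Amazon directly. The token value, the exact endpoint and its parameters are assumptions taken from the GET example above, and the product URL is just a placeholder.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.net.URLEncoder;

public class ProxyApiFetch {

    // Placeholder token; endpoint and parameters are assumed from the example above.
    private static final String API_TOKEN = "yourtoken";

    // Fetch the target page through the proxy API instead of connecting to Amazon directly.
    // The API returns the HTML of the target page, so Jsoup can parse it as usual.
    public static Document fetchThroughApi(String targetUrl) throws IOException {
        String apiUrl = "https://api.proxycrawl.com?token=" + API_TOKEN
                + "&url=" + URLEncoder.encode(targetUrl, "UTF-8");
        return Jsoup.connect(apiUrl).ignoreContentType(true).get();
    }

    public static void main(String[] args) throws IOException {
        Document doc = fetchThroughApi("https://www.amazon.de/dp/1535287004");
        System.out.println(doc.title());
    }
}

In SpiderLeg.crawl you would only have to build the API URL this way and connect to it instead of connecting to the Amazon URL directly; the rest of the parsing code stays the same.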

I hope it helps :)



You can try using proxy servers to prevent being blocked. There are services providing working proxies. I have had good experience with https://gimmeproxy.com, which specifically has proxies supporting Amazon.

To get a proxy that works with Amazon, you just need to make the following request:

https://gimmeproxy.com/api/getProxy?api_key=your_api_key&websites=amazon

You will get a JSON response with all the proxy data, which you can use later as needed:

{
  "supportsHttps": true,
  "protocol": "socks5",
  "ip": "116.182.122.182",
  "port": "1915",
  "get": true,
  "post": true,
  "cookies": true,
  "referer": true,
  "user-agent": true,
  "anonymityLevel": 1,
  "websites": {
    "example": true,
    "google": false,
    "amazon": true
  },
  "country": "BR",
  "tsChecked": 1517952910,
  "curl": "socks5://116.182.122.182:1915",
  "ipPort": "116.182.122.182:1915",
  "type": "socks5",
  "speed": 37.78,
  "otherProtocols": {}
}
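Here is a hedged sketch of how the returned proxy could be plugged into the existing Jsoup code: it fetches a proxy from the endpoint above, pulls out the ipPort field (crudely, without a JSON library), and passes a java.net.Proxy to Jsoup's .proxy(...) method (available in recent Jsoup versions). The API key and target URL are placeholders, and the SOCKS type is assumed from the sample response.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Proxy;

public class ProxyFetch {

    // Placeholder key; the endpoint and the ipPort field come from the response above.
    private static final String API_KEY = "your_api_key";

    // Ask gimmeproxy for a proxy that supports Amazon and turn it into a java.net.Proxy.
    public static Proxy getAmazonProxy() throws IOException {
        String json = Jsoup.connect("https://gimmeproxy.com/api/getProxy?api_key="
                        + API_KEY + "&websites=amazon")
                .ignoreContentType(true)
                .execute()
                .body();
        // Crude extraction of the "ipPort" field; a real implementation should use a JSON library.
        String ipPort = json.split("\"ipPort\":\\s*\"")[1].split("\"")[0];
        String[] parts = ipPort.split(":");
        // The sample response advertises a SOCKS5 proxy, hence Proxy.Type.SOCKS here.
        return new Proxy(Proxy.Type.SOCKS, new InetSocketAddress(parts[0], Integer.parseInt(parts[1])));
    }

    public static void main(String[] args) throws IOException {
        Document doc = Jsoup.connect("https://www.amazon.de/dp/1535287004")
                .proxy(getAmazonProxy()) // route the request through the fetched proxy
                .get();
        System.out.println(doc.title());
    }
}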
