Multi-threaded Java Web Crawler
February 9, 2012
I stumbled upon this code on an old USB drive and decided to share it in case someone finds it helpful.
This is basically a program that mass-downloads the files in a list. In this example, the list of files is generated from a URL template that uses a specific range syntax: an input of http://shin.ws/[!<-1,100->!], for example, expands to a list containing http://shin.ws/1, http://shin.ws/2, ..., http://shin.ws/100. I arbitrarily chose ask.com for the example below. The program can easily be extended so that it imports the list from other sources.
Each download request is stored in a DownInfo object, and the requests are queued in DownloadQueue. The default number of threads is 16.
/* Daeyun Shin */
/* M_HTTPDown.java */
import java.io.*;
import java.util.ArrayList;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
/**
 * Entry point of the downloader.
 *
 * Expands a URL template containing a numeric range marker of the form
 * {@code [!<-FIRST,LAST->!]} into one URL per number, enqueues a
 * {@link DownInfo} for each of them in {@link DownloadQueue}, and then starts
 * {@link #maxThreadCount} {@link URLDownload} worker threads to drain the queue.
 */
public class M_HTTPDown {
    /** Number of worker threads started by {@link #main(String[])}. */
    final static int maxThreadCount = 16;

    /** User-Agent handed out by {@link #getUA()}; identical on every code path. */
    private static final String DEFAULT_USER_AGENT =
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US;" +
            "rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10";

    /**
     * Range marker: both bounds must be non-empty digit runs, captured as
     * groups 1 and 2. Optional whitespace after the comma is tolerated.
     */
    private static final Pattern RANGE_PATTERN =
            Pattern.compile("\\[!<-([0-9]+),\\s*([0-9]+)->!\\]");

    /** Directory that receives the downloaded pages. */
    private static final String OUTPUT_DIR = "HTTPDown";

    /** Lines of UA-list.txt, loaded on the first call to {@link #getUA()}. */
    private static ArrayList<String> userAgents;

    /**
     * Builds the download queue from the hard-coded URL template and starts
     * the worker threads.
     *
     * Exits with status 1 if the template has no valid range marker or the
     * output directory cannot be created.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String fURL = "http://www.ask.com/web?q=[!<-20001,50000->!]&page=1";

        Matcher m = RANGE_PATTERN.matcher(fURL);
        if (!m.find()) {
            System.out.println("Wrong regex format: " + fURL);
            System.exit(1);
            return;
        }
        System.out.println(m.group());

        int n1, n2;
        try {
            n1 = Integer.parseInt(m.group(1));
            n2 = Integer.parseInt(m.group(2));
        } catch (NumberFormatException ex) {
            // Digits only, so this can only mean a bound does not fit in an int.
            System.out.println("Wrong regex format: " + fURL);
            System.exit(1);
            return;
        }

        // Split the template once around the marker instead of re-running a
        // regex replacement for every generated URL.
        String prefix = fURL.substring(0, m.start());
        String suffix = fURL.substring(m.end());

        // The workers open the output files directly, which fails for every
        // file when the directory is missing.
        File outputDir = new File(OUTPUT_DIR);
        if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
            System.out.println("Failed to create output directory: " + outputDir.getPath());
            System.exit(1);
            return;
        }

        for (int i = n1; i <= n2; i++) {
            DownInfo d = new DownInfo(prefix + i + suffix,
                    getUA(), "UTF-8", "UTF-8", OUTPUT_DIR + "/" + i + ".html");
            DownloadQueue.add(d);
        }

        for (int i = maxThreadCount; i > 0; i--) {
            Thread downloadThread = new Thread(new URLDownload());
            downloadThread.start();
        }
    }

    /**
     * Returns the User-Agent string to send with a request.
     *
     * UA-list.txt is read once, on the first call, and cached. Picking a
     * random entry from that list is intentionally disabled for now, so the
     * built-in default is always returned. To re-enable it, return
     * {@code userAgents.get((int) (Math.random() * userAgents.size()))} when
     * the list is non-empty.
     *
     * @return the default User-Agent string
     */
    public static synchronized String getUA() {
        if (userAgents == null) {
            userAgents = loadUserAgents();
        }
        return DEFAULT_USER_AGENT;
    }

    /**
     * Reads UA-list.txt line by line, always closing the file.
     *
     * @return the lines read; empty or partial if the file could not be read
     */
    private static ArrayList<String> loadUserAgents() {
        ArrayList<String> list = new ArrayList<String>();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(new File("UA-list.txt")));
            String line;
            while ((line = reader.readLine()) != null) {
                list.add(line);
            }
        } catch (IOException ex) {
            System.out.println("Failed to read UA-list.txt");
            ex.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only file fails.
                }
            }
        }
        return list;
    }
}
/* DownloadQueue.java */
import java.util.Queue;
import java.util.LinkedList;
/**
 * Process-wide FIFO queue of pending downloads.
 *
 * The backing {@link LinkedList} is not thread-safe on its own, so every
 * accessor below is {@code synchronized} on this class. Code that touches
 * {@link #downloadList} directly bypasses that lock.
 */
public class DownloadQueue {
    /** Backing queue; prefer the synchronized accessors over direct access. */
    public static Queue<DownInfo> downloadList = new LinkedList<DownInfo>();

    /**
     * Appends a download job to the tail of the queue.
     *
     * @param di the job to enqueue
     */
    public static synchronized void add(DownInfo di) {
        downloadList.add(di);
    }

    /**
     * Removes and returns the job at the head of the queue.
     *
     * @return the next job, or {@code null} if the queue is empty
     */
    public static synchronized DownInfo poll() {
        return downloadList.poll();
    }

    /**
     * Returns the job at the head of the queue without removing it.
     *
     * Synchronized like {@link #add} and {@link #poll} so it never reads the
     * list while another thread is modifying it.
     *
     * @return the next job, or {@code null} if the queue is empty
     */
    public static synchronized DownInfo peek() {
        return downloadList.peek();
    }
}
/*
* DownInfo.java
* */
/**
 * Value holder describing one download job. {@link URLDownload} copies these
 * fields into its own state before fetching.
 */
public class DownInfo {
    /** Address that is opened as a {@code java.net.URL}. */
    String url;
    /** Value sent in the {@code User-Agent} request header. */
    String userAgent;
    /** Charset name used to decode the response body. */
    String inputEncoding;
    /** Charset name used to encode the saved file. */
    String outputEncoding;
    /** Path of the file the response is written to. */
    String fileName;

    /**
     * Stores the five settings as given; no validation is performed.
     *
     * @param u         address to download
     * @param ua        User-Agent header value
     * @param iEncoding charset name for reading the response
     * @param oEncoding charset name for writing the output file
     * @param fName     output file path
     */
    public DownInfo(String u, String ua, String iEncoding,
                    String oEncoding, String fName) {
        this.url = u;
        this.userAgent = ua;
        this.inputEncoding = iEncoding;
        this.outputEncoding = oEncoding;
        this.fileName = fName;
    }
}
/* URLDownload.java */
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.SocketTimeoutException;
import java.net.URL;
/**
 * Worker that repeatedly takes a {@link DownInfo} from {@link DownloadQueue},
 * fetches its URL over HTTP and writes the response, line by line, to the
 * job's output file. The worker ends when the queue returns {@code null}.
 *
 * Each instance keeps its per-job state in fields, so an instance must be run
 * by one thread only.
 */
public class URLDownload implements Runnable {
    /** Connect and read timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;

    URL u;
    HttpURLConnection httpConnection;
    String line, content = "";
    BufferedReader reader;
    BufferedWriter writer;
    String url, userAgent, inputEncoding, outputEncoding, fileName;
    DownInfo tempD;
    /** Characters written for the current job, counting one per line break. */
    int oSize;
    /** Random label used only in log output; not guaranteed to be unique. */
    int threadID = (int) (Math.random() * 9999);

    /**
     * Copies the settings of a job into this worker's fields.
     *
     * @param d the job about to be processed
     */
    public void initialize(DownInfo d) {
        url = d.url;
        userAgent = d.userAgent;
        inputEncoding = d.inputEncoding;
        outputEncoding = d.outputEncoding;
        fileName = d.fileName;
    }

    /**
     * Processes jobs until the queue is empty.
     *
     * A failed job is logged and skipped; it never stops the worker. Only a
     * 200 response is saved. Every line is written followed by {@code "\n"}.
     */
    @Override
    public void run() {
        System.out.println("Thread " + threadID + " has started.");
        while ((tempD = DownloadQueue.poll()) != null) {
            initialize(tempD);

            // Reset per-job state so a failing job can neither report the
            // previous job's size nor close or log the previous job's objects.
            u = null;
            httpConnection = null;
            reader = null;
            writer = null;
            oSize = 0;
            boolean finished = false;

            try {
                u = new URL(url);
                httpConnection = (HttpURLConnection) u.openConnection();
                httpConnection.setRequestProperty("User-Agent", userAgent);
                // Without a connect timeout an unreachable host can block the worker forever.
                httpConnection.setConnectTimeout(TIMEOUT_MILLIS);
                httpConnection.setReadTimeout(TIMEOUT_MILLIS);

                int status = httpConnection.getResponseCode();
                if (status == 200) {
                    reader = new BufferedReader(new InputStreamReader(
                            httpConnection.getInputStream(), inputEncoding));
                    writer = new BufferedWriter(new OutputStreamWriter(
                            new FileOutputStream(fileName), outputEncoding));
                    while ((line = reader.readLine()) != null) {
                        writer.write(line + "\n");
                        oSize += line.length() + 1;
                    }
                    // Close explicitly on the success path so a failed final
                    // flush is reported instead of being swallowed below.
                    writer.close();
                    finished = true;
                } else {
                    System.out.println("httpConnection Error: " + status);
                }
            } catch (SocketTimeoutException ex) {
                System.out.println("Connection timed out.");
            } catch (Exception ex) {
                ex.printStackTrace();
            } finally {
                closeQuietly(reader);
                closeQuietly(writer);
                // After a complete read the connection is left alone so it can
                // be reused; otherwise release its socket.
                if (!finished && httpConnection != null) {
                    httpConnection.disconnect();
                }
            }
            // Log the requested address rather than the URL object, which is
            // null when the address could not be parsed.
            System.out.println(threadID + ": " + url + "[" + oSize + "]");
        }
        System.out.println("End of thread" + threadID + ".");
    }

    /**
     * Closes a resource, ignoring a null argument and any failure to close.
     *
     * @param resource the resource to close; may be {@code null}
     */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (java.io.IOException ignored) {
            // Cleanup only; the outcome of the job has already been logged.
        }
    }
}
