Science, philosophy, programming, and humor for a long and prosperous life

Multi-threaded Java Web Crawler

I accidentally found this code on an old USB drive and decided to share it in case someone finds it helpful.

This is basically a program that mass-downloads files in a list. In this example code, the list of the files is generated using a specific syntax: a URL input of http://shin.ws/[!<-1,100->!], for example, is a list containing http://shin.ws/1, http://shin.ws/2, ... http://shin.ws/100. I just arbitrarily chose ask.com in the example below. It can be easily expanded so that it imports the list from different sources.

Each download request is stored in a DownInfo object, and these objects are queued in the DownloadQueue queue. The default number of threads is 16.

/* Daeyun Shin */

/* M_HTTPDown.java */
import java.io.*;
import java.util.ArrayList;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
/**
 * Entry point of the mass downloader.
 *
 * <p>Expands a URL template containing a numeric range placeholder of the form
 * {@code [!<-FIRST,LAST->!]} into one URL per integer in {@code FIRST..LAST},
 * queues a {@link DownInfo} for each of them in {@link DownloadQueue}, and then
 * starts {@link #maxThreadCount} {@link URLDownload} worker threads.
 */
public class M_HTTPDown {
  /** Number of worker threads started to drain the download queue. */
  final static int maxThreadCount = 16;

  /**
   * Range placeholder, e.g. {@code [!<-1,100->!]}. Group 1 is the first number
   * and group 2 the last one; whitespace is tolerated after the comma.
   */
  private static final Pattern RANGE_PATTERN =
      Pattern.compile("\\[!<-([0-9]+),\\s*([0-9]+)->!\\]");

  /** Directory the downloaded pages are written to. */
  private static final String OUTPUT_DIR = "HTTPDown";

  /** User agent returned while picking one from UA-list.txt is disabled. */
  private static final String DEFAULT_UA =
      "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; " +
      "rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10";

  /** Lines of UA-list.txt, loaded on first use; empty if the file is unreadable. */
  private static ArrayList<String> uaList;

  public static void main(String[] args){
    String fURL = "http://www.ask.com/web?q=[!<-20001,50000->!]&page=1";

    Matcher m = RANGE_PATTERN.matcher(fURL);
    if (!m.find()) {
      System.out.println("Wrong regex format: " + fURL);
      System.exit(1);
      return;
    }
    System.out.println(m.group());

    int n1,n2;
    try{
      n1=Integer.parseInt(m.group(1));
      n2=Integer.parseInt(m.group(2));
    }catch(NumberFormatException ex){
      // The digits matched the pattern but do not fit into an int.
      System.out.println("Range bounds out of int range: " + m.group());
      System.exit(1);
      return;
    }

    // The workers open "HTTPDown/<i>.html" directly, so the directory has to
    // exist before any of them starts.
    File outDir = new File(OUTPUT_DIR);
    if (!outDir.isDirectory() && !outDir.mkdirs()) {
      System.out.println("Failed to create output directory: " + outDir);
      System.exit(1);
      return;
    }

    // Split the template once around the placeholder instead of running a
    // regex replacement for every generated URL.
    String prefix = fURL.substring(0, m.start());
    String suffix = fURL.substring(m.end());
    for(int i=n1;i<=n2;i++){
      DownInfo d= new DownInfo(prefix + i + suffix
              ,getUA(), "UTF-8","UTF-8", OUTPUT_DIR+"/"+i+".html");
      DownloadQueue.add(d);
    }

    // The queue is fully populated before the workers start; each worker
    // exits as soon as it finds the queue empty.
    for(int i=maxThreadCount;i>0;i--){
      Thread downloadThread = new Thread(new URLDownload());
      downloadThread.start();
    }
  }

  /**
   * Returns the User-Agent header value to use for a download.
   *
   * <p>UA-list.txt is still read (once) so a missing file is reported, but
   * choosing a random entry from it is temporarily disabled and a fixed
   * Firefox user agent is returned instead. To re-enable it, return
   * {@code list.get((int)(Math.random()*list.size()))} when the list is
   * not empty.
   *
   * @return the User-Agent string
   */
  public static String getUA(){
    loadUserAgents();
    return DEFAULT_UA;
  }

  /**
   * Loads UA-list.txt (one user agent per line) the first time it is called
   * and caches the result, so the file is not re-read for every queued URL.
   *
   * @return the cached list; empty when the file could not be read
   */
  private static synchronized ArrayList<String> loadUserAgents(){
    if (uaList != null) {
      return uaList;
    }
    ArrayList<String> list = new ArrayList<String>();
    BufferedReader reader = null;
    try{
      reader = new BufferedReader(new FileReader(new File("UA-list.txt")));
      String line;
      while((line=reader.readLine())!=null){
        list.add(line);
      }
    }catch(IOException ex){
      System.out.println("Failed to read UA-list.txt");
      ex.printStackTrace();
    }finally{
      if (reader != null) {
        try{
          reader.close();
        }catch(IOException ignored){
          // Nothing useful can be done if closing a read-only file fails.
        }
      }
    }
    uaList = list;
    return uaList;
  }
}

/* DownloadQueue.java */
import java.util.Queue;
import java.util.LinkedList;
/**
 * Process-wide FIFO queue of pending downloads shared by all worker threads.
 *
 * <p>{@link #add}, {@link #poll} and {@link #peek} all synchronize on this
 * class, so they may be called from any thread. Code that touches
 * {@link #downloadList} directly bypasses that lock.
 */
public class DownloadQueue {
  /** Backing list; not thread-safe on its own, use the static methods below. */
  public static Queue<DownInfo> downloadList = new LinkedList<DownInfo>();

  /**
   * Appends a download request to the tail of the queue.
   *
   * @param di the request to enqueue
   */
  public static synchronized void add(DownInfo di){
    downloadList.add(di);
  }

  /**
   * Removes and returns the request at the head of the queue.
   *
   * @return the next request, or {@code null} if the queue is empty
   */
  public static synchronized DownInfo poll(){
    return downloadList.poll();
  }

  /**
   * Returns the request at the head of the queue without removing it.
   *
   * <p>Synchronized like {@link #add} and {@link #poll}: the backing
   * {@code LinkedList} must not be read while another thread modifies it.
   *
   * @return the next request, or {@code null} if the queue is empty
   */
  public static synchronized DownInfo peek(){
    return downloadList.peek();
  }
}

/*
 * DownInfo.java
 * */
/**
 * Plain value holder describing a single download: where to fetch from,
 * which User-Agent to send, how to decode the response, how to encode the
 * saved file, and where to save it.
 */
public class DownInfo {
  /** Address to download. */
  String url;
  /** Value sent as the User-Agent request header. */
  String userAgent;
  /** Charset name used to decode the response body. */
  String inputEncoding;
  /** Charset name used to encode the saved file. */
  String outputEncoding;
  /** Path of the file the response is written to. */
  String fileName;

  /**
   * Stores the five settings unchanged.
   *
   * @param u         address to download
   * @param ua        User-Agent header value
   * @param iEncoding charset name for reading the response
   * @param oEncoding charset name for writing the file
   * @param fName     output file path
   */
  public DownInfo(String u, String ua, String iEncoding,
          String oEncoding, String fName) {
    this.url = u;
    this.userAgent = ua;
    this.inputEncoding = iEncoding;
    this.outputEncoding = oEncoding;
    this.fileName = fName;
  }
}

/* URLDownload.java */
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.SocketTimeoutException;
import java.net.URL;
/**
 * Worker that repeatedly takes a {@link DownInfo} from {@link DownloadQueue},
 * fetches its URL over HTTP and writes the response body to the requested
 * file, until the queue is empty.
 *
 * <p>The body is copied line by line; every line is written followed by a
 * single {@code '\n'}, so the original line terminators are not preserved.
 */
public class URLDownload implements Runnable{
  URL u;
  HttpURLConnection httpConnection;
  String line,content="";
  BufferedReader reader;
  BufferedWriter writer;
  String url, userAgent, inputEncoding, outputEncoding, fileName;
  DownInfo tempD;
  /** Characters written for the current download (line lengths plus one per line). */
  int oSize;
  /** Random label used only to tell the workers apart in the console output. */
  int threadID=(int)(Math.random()*9999);

  /**
   * Copies the settings of a queued request into this worker's fields.
   *
   * @param d the request about to be processed
   */
  public void initialize(DownInfo d){
    url=d.url;
    userAgent=d.userAgent;
    inputEncoding=d.inputEncoding;
    outputEncoding=d.outputEncoding;
    fileName=d.fileName;
  }

  /**
   * Processes queued downloads until {@link DownloadQueue#poll()} returns
   * {@code null}. A failed download is reported on the console and does not
   * stop the worker.
   */
  public void run(){
    System.out.println("Thread "+threadID+" has started.");
    while((tempD=DownloadQueue.poll())!=null){
      initialize(tempD);
      // Reset per-download state so a failure never reports the URL, size or
      // streams of the previous item.
      u=null;
      httpConnection=null;
      reader=null;
      writer=null;
      oSize=0;
      try{
        u = new URL(url);
        httpConnection = (HttpURLConnection) u.openConnection();
        httpConnection.setRequestProperty("User-Agent", userAgent);
        // Bound the connect phase as well; without it an unreachable host
        // can block the worker indefinitely.
        httpConnection.setConnectTimeout(10000);
        httpConnection.setReadTimeout(10000);
        int status = httpConnection.getResponseCode();
        if(status==200){
          reader = new BufferedReader(new InputStreamReader(
                  httpConnection.getInputStream(), inputEncoding ));
          writer = new BufferedWriter(new OutputStreamWriter(
                  new FileOutputStream(fileName), outputEncoding));
          while((line=reader.readLine())!=null){
            writer.write(line+"\n");
            oSize+=line.length()+1;
          }
        }else{
          System.out.println("httpConnection Error: "+status);
          // The body of an error response is not read, so release the
          // connection explicitly.
          httpConnection.disconnect();
        }
      }catch(SocketTimeoutException ex){
        System.out.println("Connection timed out.");
      }catch(Exception ex){
        ex.printStackTrace();
      }finally{
        // Always release the streams, including when the copy loop failed
        // half-way. The writer goes first so buffered output is flushed.
        close(writer);
        close(reader);
      }
      // u stays null when the URL string was malformed; fall back to the raw
      // string instead of dereferencing it.
      System.out.println(threadID+": "+(u!=null ? u.toString() : url)
              + "["+oSize+"]");
    }
    System.out.println("End of thread"+threadID+".");
  }

  /**
   * Closes a stream if it was opened, reporting (not propagating) a failure.
   *
   * @param c the stream to close; may be {@code null}
   */
  private static void close(java.io.Closeable c){
    if(c==null){
      return;
    }
    try{
      c.close();
    }catch(java.io.IOException ex){
      ex.printStackTrace();
    }
  }
}

Approximate e^x without the math library in C

This is a simple C program that approximates e^x based on its Taylor series. It keeps adding terms until the most recently added term is smaller in magnitude than the input value r.

/* Daeyun Shin */
#include <stdio.h>
#include <stdlib.h>
/*
 * Raises x to the integer power y by repeated multiplication.
 * For y <= 0 no multiplication happens and 1 is returned.
 */
double power(double x, int y){
    double product = 1;
    int remaining = y;
    while (remaining > 0) {
        product *= x;
        remaining--;
    }
    return product;
}
/*
 * Returns x! as an int; any x <= 1 (including negatives) yields 1.
 * NOTE: 13! no longer fits in a 32-bit int, so results for x > 12 overflow
 * on such platforms.
 */
int factorial(int x){
    int product = 1;
    int k;
    for (k = 2; k <= x; k++) {
        product *= k;
    }
    return product;
}
/*
 * Returns the n-th term (1-based) of the Taylor series of e^x, that is
 * x^(n-1) / (n-1)!.
 *
 *   x             point at which e^x is being approximated
 *   n             1-based term index; n <= 1 yields 1
 *   errorChecking when non-zero, |x| is used instead of x so the result is
 *                 the magnitude of the term
 *
 * The term is built up as a running product of x/k in double precision.
 * Dividing by a separately computed integer factorial overflows int once
 * n - 1 exceeds 12, which would make the result meaningless.
 */
double nth_taylor(double x,int n,int errorChecking){
    double term=1;
    int k;
    if (errorChecking && x<0) x=-x;
    for(k=1;k<n;k++) term*=x/k;
    return term;
}
/*
 * Reads x and the tolerance r from standard input, then sums Taylor terms of
 * e^x until the magnitude of the most recently added term is below r, and
 * prints the number of terms used together with the approximation.
 *
 * Returns EXIT_FAILURE when the input cannot be parsed, when x is not a
 * finite number, or when r is not strictly positive. In the latter two
 * cases the summation loop below would never terminate.
 */
int main(){
    int n=0;
    double ans=0,r,x;
    printf("Enter in the value of x:");
    /* x - x is 0 only for finite x; it is NaN for infinities and NaN. */
    if(scanf("%lf",&x)!=1 || x-x!=0.0){
        fprintf(stderr,"x must be a finite number.\n");
        return EXIT_FAILURE;
    }
    printf("Enter in the value of r:");
    /* !(r>0) also rejects NaN. */
    if(scanf("%lf",&r)!=1 || !(r>0)){
        fprintf(stderr,"r must be a positive number.\n");
        return EXIT_FAILURE;
    }
    /* For n == 0 nth_taylor returns 1, so the first term is added whenever
       r <= 1; afterwards the test looks at the term added last. */
    while(nth_taylor(x,n,1)>=r){
        n++;
        ans+=nth_taylor(x,n,0);
    }
    printf("Number of terms = %d\nAnswer = %lf",n,ans);
    return 0;
}

IRC KickBan Bot

I wrote a simple IRC bot in Python. Whenever a user joins a channel, it immediately bans and kicks the user. It is useful for channel squatting and pranks. It can be easily expanded so that it kickbans under certain conditions.

You can PM the bot to make it join or leave a channel.

For example, /msg iKickban join #testchannel, /msg iKickban leave #testchannel

#!/usr/bin/python2
# Minimal IRC kick-ban bot (Python 2).
#
# Connects to the configured server, joins the channels in `channels`, and
# bans + kicks every user who joins a channel the bot is in. The owner can
# make the bot join or leave a channel with a private message:
#   /msg <nick> join #channel
#   /msg <nick> leave #channel
import socket, string, re

host="irc.freenode.net"
port=6667
nick="iKickban"
ident="iKickban"
realname="kickban"
readbuffer=""

kickmsg="hi"
channels=["#channel1", "#channel2"] #list of channels to auto join
# The bot will ignore commands from other users.
# NOTE: the owner is recognised by nickname only, so anyone who can take
# this nick can control the bot.
owner="daeyun"

s=socket.socket()
s.connect((host,port))
s.send("NICK %s\n" % nick)
s.send("USER %s %s abc :%s\n" % (ident, host, realname))
for channel in channels:
    s.send("JOIN %s\n" % channel)

while 1:
    data=s.recv(1024)
    if not data:
        # An empty read means the server closed the connection; without this
        # check the loop would spin forever on empty reads.
        break
    readbuffer=readbuffer+data
    temp=readbuffer.split("\n")
    # The last element is an incomplete line (or ""); keep it for the next read.
    readbuffer=temp.pop()
    for line in temp:
        line=line.rstrip().split()
        if not line:
            continue

        # A message sent by a user starts with ":nick!user@host". Reset
        # `user` for every line so a server message never reuses the nick
        # from an earlier line.
        m=re.match(r':([^!]+)!', line[0])
        if m:
            user=m.group(1)
        else:
            user=None

        if line[0]=="PING":
            if len(line)>1:
                s.send("PONG %s\n" % line[1])
        elif len(line)<3:
            # Every command handled below needs at least three tokens.
            continue
        elif line[1]=="PRIVMSG" and line[2]==nick:
            # Owner commands: ":join <channel>" / ":leave <channel>".
            if user==owner and len(line)>4:
                if line[3]==":join":
                    s.send("JOIN %s\n" % line[4])
                elif line[3]==":leave":
                    s.send("PART %s\n" % line[4])
        elif line[1]=="JOIN":
            if user is not None and user!=nick:
                # Some servers send the channel as a trailing parameter
                # (":#channel"); drop the leading colon.
                channel=line[2].lstrip(":")
                s.send("MODE %s +b %s\n" % (channel,user))
                s.send("KICK %s %s :%s\n" % (channel,user,kickmsg))