// Keywords: simple web crawler that fetches URLs from pages.
// begin
package com.ogilvy.sayes.util;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
/*
Description: web page crawler
Author : long.tang
*/
public class SearchCrawler {
public String myGetHttpFile2(String url) {
String urlSource = url;
StringBuffer htmlBuffer = new StringBuffer();
String returnStr = null;
try {
InputStream imageSource = new URL(urlSource).openStream();
int ch;
while ((ch = imageSource.read()) > -1) {
htmlBuffer.append((char) ch);
}
imageSource.close();
returnStr = new String(htmlBuffer);
returnStr = new String(returnStr.getBytes("ISO8859_1"), "GBK");
} catch (Exception e) {
System.out.println("error>>>>");
e.printStackTrace();
}
//System.out.println("@@@:" + returnStr);
if (returnStr != null) {
return returnStr;
} else {
return "nothing";
}
}
public void doit(String content, int depth) throws Exception {
depth--;
if (depth < 1) {
//System.out.println("break::::");
return;
}
SearchCrawler search = new SearchCrawler();
ArrayList list = new ArrayList();
int j = 0;
String start = "href=";
String end = "\"";
String url = "";
String type = "http";
String[] urls;
while (content.indexOf(start, j) > -1) {
url = content.substring(content.indexOf(start, j) + 6, content.indexOf(end, content.indexOf(start, j) + 6));//+6 href="
if (url.indexOf(type) > -1) {
if (url.indexOf(".css") == -1&&url.indexOf(".ico") == -1&&url.indexOf(".exe") == -1) {
System.out.println(url);
list.add(url);
if (list != null && list.size() > 0) {
for (int k = 0; k < list.size(); k++) {
doit(search.myGetHttpFile2(String.valueOf(list.get(k))), depth);
}
}
}
}
j = content.indexOf(start, j) + 1;
}
}
public static void main(String arg[]) {
SearchCrawler search = new SearchCrawler();
try {
search.doit(search.myGetHttpFile2("http://www.2345.com/"),3);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
// end