java 抓取网站数据

假设你需要获取51job 人才网上java 人才的需求数量,首先你需要分析51job 网站的搜索这

一块是怎么运作的,通过解析网页的源代码,我们发现了以下一些信息:

1. 搜索时页面请求的URL 是 http://search.51job.com/jobsearch/search_result.php

2. 请求所用的方法为:POST

3. 返回的页面的编码格式为:GBK

4. 假设我们想获取搜索java 人才时结果页面中显示的需求数量,我们发现数量位于返回的

HTML 数据中这样的一段代码之中:<td>1-30 / 14794</td>,于是我们可以得到这样的一个

模式:".+1-\d+ / (\d+).+",第一个分组的内容就是我们需要的最终数据,有关java 中的模式,

请参考java 文档中Pattern 类的介绍

5. 另外作为POST 请求,页面向服务器发送的数据如下(这个很容易通过prototype 这样的js

框架抓取到, 参考我的其它博客介绍) :

lang=c&stype=1&postchannel=0000&fromType=1&line=&keywordtype=2&keyword=java&btnJ
obarea=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&jobarea=0000&image=&btn
Funtype=%E9%80%89%E6%8B%A9%2F%E4%BF%AE%E6%94%B9&funtype=0000&btnInd
ustrytype=%E9%80%89%E6%8B%A9%2F%E4%BF%AE%E6%94%B9&industrytype=00

对于第5 条中的数据哪些是服务器真正需要的我们不管,全部发送过去就是了。有了这些准

备,我们就可以真正开始通过java 发送请求,并获得最终数据了。

我们定义Resource 类,这个类封装所有的与请求有关的信息,Resource 包括以下属性:

Resource 类的属性定义如下:
/**
* 需要获取资源的目标地址,不包含查询串
*/
private String target;
/**
* get 请求时的查询串,或post 请求的请求数据
*/
private String queryData = "";
/**
* 请求方式,get / post
*/
private String method = "GET";
/**
* 返回的数据的编码类型
*/
private String charset = "GBK";
/**
* 抓取数据的模式,将根据模式的分组来返回数据列表
*/
private String pattern;
/**
* 需要获取资源的目标地址,不包含查询串
*/
private String target;
/**
* get 请求时的查询串,或post 请求的请求数据
*/
private String queryData = "";
/**
* 请求方式,get / post
*/
private String method = "GET";
/**
* 返回的数据的编码类型
*/
private String charset = "GBK";
/**
* 抓取数据的模式,将根据模式的分组来返回数据列表
*/
private String pattern;

以下为抓取内容的代码:

抓取内容的核心代码如下:
// Assume "res" is a Resource instance encapsulating all request info.
// res.getTarget() returns the target address; for a GET request it
// already contains the query string.
URL url = new URL(res.getTarget());
HttpURLConnection con = (HttpURLConnection) url.openConnection(); // connect to the target
con.setRequestMethod(res.getMethod()); // GET or POST
// Standard HTTP request headers.
con.setRequestProperty("accept", "*/*");
con.setRequestProperty("connection", "Keep-Alive");
con.setRequestProperty("user-agent",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)");
con.setDoInput(true);
if (res.getMethod().equals("POST")) { // POST: write the request body
    con.setDoOutput(true);
    con.getOutputStream().write(res.getQueryData().getBytes());
    con.getOutputStream().flush();
}
// Read the response line by line; adapt here if you need the whole body
// instead of the first matching line.
BufferedReader br = new BufferedReader(new InputStreamReader(
        con.getInputStream(), res.getCharset()));
Pattern pattern = Pattern.compile(res.getPattern());
String s = null;
while ((s = br.readLine()) != null) {
    System.out.println(s);
    Matcher m = pattern.matcher(s); // does this line match the pattern?
    if (!m.matches()) {
        continue;
    }
    // Collect every captured group and return them as a list.
    // (The original loop header was garbled; reconstructed here.)
    int size = m.groupCount();
    List result = new ArrayList(size);
    for (int i = 0; i < size; i++) {
        result.add(m.group(i + 1));
    }
    return result;
}

远程抓取页面信息并解析XML

XmlTransfer.java 负责链接对方服务器

package untitled1;
import java.net.URL;
import java.net.URLConnection;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.ProtocolException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import org.w3c.dom.*;
import javax.xml.parsers.*;
public class XmlTransfer {
    /** Target URL of the remote server. */
    private String urlAddr;
    /** Request payload (XML) written to the connection; may be empty. */
    private String xmlStr;
    HttpURLConnection urlCon = null;

    public XmlTransfer(String _urlAddr, String _xmlStr) {
        this.urlAddr = _urlAddr;
        this.xmlStr = _xmlStr;
    }

    /**
     * Sends xmlStr to the target and returns the response stream.
     * The caller is responsible for closing the returned stream.
     *
     * @throws Exception when the connection could not be established
     */
    public InputStream get() throws Exception {
        if (urlCon == null) {
            urlCon = getUrlConnection();
        }
        if (urlCon == null) {
            throw new Exception("连接失败");
        }
        PrintWriter out = new PrintWriter(urlCon.getOutputStream());
        out.print(xmlStr);
        out.flush();
        out.close();
        // BUG FIX: the original called urlCon.disconnect() BEFORE
        // getInputStream(); disconnect() may close the underlying socket
        // and make the response unreadable. Disconnecting is now left to
        // the caller, after the stream has been consumed.
        return urlCon.getInputStream();
    }

    /**
     * Opens and configures the HTTP connection; returns null on failure
     * (errors are printed, not propagated).
     * NOTE(review): the request method is GET yet get() writes a body —
     * many servers ignore GET bodies; confirm POST was not intended.
     */
    private HttpURLConnection getUrlConnection() {
        try {
            URL url = new URL(urlAddr);
            URLConnection conn = url.openConnection();
            urlCon = (HttpURLConnection) conn;
            urlCon.setRequestProperty("Content-type", "text/html;charset=gb2312");
            urlCon.setDoOutput(true);
            urlCon.setRequestMethod("GET");
            urlCon.setUseCaches(false);
        } catch (MalformedURLException mex) {
            mex.printStackTrace();
        } catch (ProtocolException pex) {
            pex.printStackTrace();
        } catch (IOException iex) {
            iex.printStackTrace();
        }
        return urlCon;
    }

    /**
     * Convenience wrapper: fetches a URL and returns the whole body as a
     * string. On failure returns an error description instead of throwing.
     */
    public static String getHttp(String strURL) {
        XmlTransfer xt = new XmlTransfer(strURL, "");
        StringBuffer sb = new StringBuffer();
        try {
            InputStream is = xt.get();
            // BUG FIX: decoding each 1 KB chunk separately could split a
            // multi-byte character; buffer all bytes and decode once.
            // NOTE(review): still decodes with the platform default charset,
            // like the original — confirm gb2312 should be forced here.
            java.io.ByteArrayOutputStream bytes = new java.io.ByteArrayOutputStream();
            byte[] b = new byte[1024];
            int iCount;
            while ((iCount = is.read(b)) > 0) {
                bytes.write(b, 0, iCount);
            }
            is.close();
            sb.append(bytes.toString());
        } catch (Exception e) {
            sb.append("An error occurs in XmlTransfer.getHttp\n");
            sb.append(e.getMessage());
        }
        return (sb.toString());
    }

    public static void main(String[] args) throws Exception {
        System.out.println(
                XmlTransfer.getHttp("http://215.117.110.81/yyoa/oainfo.jsp?comm=person"));
        //http://192.168.0.110/testProvince.html","");
    }
}
UsrDataSync.java 负责抓取页面
package untitled1;
import java.util.Calendar;
import java.util.TimerTask;
import javax.servlet.ServletContext;
import java.io.File;
/**
* <p>Title: </p>
*
* <p>Description: </p>
*
* <p>Copyright: Copyright (c) 2006</p>
*
* <p>Company: </p>
*
* @author not attributable
* @version 1.0
*/
public class UsrDataSync {
    public UsrDataSync() {
    }

    /**
     * Fetches the remote person page, strips CRLF pairs, and hands the XML
     * payload to ParseXML for parsing and synchronisation.
     * NOTE(review): ParseXML is declared in package "usersync" while this
     * class is in "untitled1" — confirm the import/package setup compiles.
     *
     * @return always false (the original never reported success either)
     */
    public static boolean doSync() {
        ParseXML parser = new ParseXML();
        String xml = XmlTransfer.getHttp("http://215.117.110.81/yyoa/oainfo.jsp?comm=person");
        xml = xml.replaceAll("\r\n", "");
        parser.doParse(xml);
        return false;
    }

    public static void main(String[] args) throws Exception {
        UsrDataSync.doSync();
    }
}

ParseXML.java 解析XML(包括正则表达式)

//import java.awt.*;
//import javax.servlet.*;
//import javax.servlet.http.*;
//import javax.servlet.jsp.*;
//import org.apache.jasper.runtime.*;
package usersync;
import java.io.*;
import java.util.*;
import javax.xml.parsers.*;
import org.w3c.dom.*;
import java.net.URL;
import java.net.URLConnection;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.ProtocolException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import javax.swing.*;
import java.sql.*;
/**
* <p>Title: </p>
*
* <p>Description: </p>
*
* <p>Copyright: Copyright (c) 2006</p>
*
* <p>Company: </p>
*
* @author not attributable
* @version 1.0
*/
public class ParseXML {
    // Parsed DOM document of the most recent doParse() call.
    Document doc = null;
    // Connection to the running database ("blog").
    public Connection con = null;
    // Connection to the history database ("blog_history").
    public Connection con_history = null;

    /**
     * Parses a person-list XML document and synchronises every person
     * element into the databases via syncUser().
     *
     * Expected element shape (reconstructed from the original regexes —
     * TODO confirm against the real feed):
     *   person[id, name, logname] with children duty, department, station,
     *   state, description.
     *
     * @param str the XML document as a string
     * @return always null; errors are printed rather than propagated
     */
    public String doParse(String str) {
        try {
            DocumentBuilder builder =
                    DocumentBuilderFactory.newInstance().newDocumentBuilder();
            // Parse from a Reader: the original used str.getBytes(), which
            // depends on the platform default charset and can corrupt
            // non-ASCII characters.
            doc = builder.parse(
                    new org.xml.sax.InputSource(new StringReader(str)));
            NodeList persons = doc.getElementsByTagName("person");
            for (int i = 0; i < persons.getLength(); i++) {
                Element person = (Element) persons.item(i);
                person.normalize();
                NodeList kids = person.getChildNodes();
                // Child indexes 1,3,5,7,9 as in the original — the even
                // positions are presumably whitespace text nodes.
                // TODO confirm this layout holds for the real feed.
                String duty = textOf(kids.item(1));
                String department = textOf(kids.item(3));
                String station = textOf(kids.item(5));
                String state = textOf(kids.item(7));
                String description = textOf(kids.item(9));
                syncUser(Integer.parseInt(person.getAttribute("id")),
                        person.getAttribute("name"),
                        person.getAttribute("logname"),
                        duty, department, station, state, description);
            }
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
        return null;
    }

    /**
     * Returns the concatenated text of a node's direct text/CDATA children,
     * or "" when the node is null. Replaces the original Node.toString()
     * plus regex tag-stripping, which relied on implementation-specific
     * toString() output and broke on attribute/whitespace variations.
     */
    private static String textOf(Node node) {
        if (node == null) {
            return "";
        }
        StringBuffer sb = new StringBuffer();
        NodeList kids = node.getChildNodes();
        for (int i = 0; i < kids.getLength(); i++) {
            Node kid = kids.item(i);
            short type = kid.getNodeType();
            if (type == Node.TEXT_NODE || type == Node.CDATA_SECTION_NODE) {
                sb.append(kid.getNodeValue());
            }
        }
        return sb.toString();
    }

    /**
     * Synchronises one user between the running DB and the history DB.
     * "在职" (active) users belong in the running DB, everyone else in the
     * history DB. All SQL now uses PreparedStatement: the original
     * concatenated user-supplied strings straight into SQL (injection risk)
     * and contained string literals broken across lines (would not compile).
     *
     * @return always false (kept for compatibility with the original)
     */
    private boolean syncUser(int uid, String usrname, String logname,
            String duty, String department, String station, String state,
            String description) {
        int pos = 0; // 0: new, 1: found in running DB, 2: found in history DB
        try {
            Class.forName("com.mysql.jdbc.Driver");
            con = DriverManager.getConnection(
                    "jdbc:mysql://localhost:3306/blog", "root", "root");
            con_history = DriverManager.getConnection(
                    "jdbc:mysql://localhost:3306/blog_history", "root", "root");
            // FLAG 1 = "中央" (central) department, 2 = everything else.
            // startsWith() also fixes the original substring(0,2), which
            // threw when the department name had fewer than 2 characters.
            int flag = department.startsWith("中央") ? 1 : 2;
            if (existsUser(con, uid)) {
                updateUser(con, uid, usrname, duty, flag, department);
                pos = 1; // running DB
            } else if (existsUser(con_history, uid)) {
                // NOTE(review): the original updated the RUNNING DB here
                // even though the row was only found in the history DB;
                // behaviour kept as-is — confirm intent.
                updateUser(con, uid, usrname, duty, 2, department);
                pos = 2; // history DB
            }
            // NOTE(review): the moveUser src/dest argument order below is
            // preserved from the original, although it looks inverted
            // relative to the comments ("move running -> history" for an
            // active user found in history). Confirm before relying on it.
            if (state.equals("在职")) { // active employee
                if (pos == 0) {
                    insertUser(con, uid, usrname, duty, flag, department);
                } else if (pos == 2) {
                    moveUser(con, con_history, uid, usrname, logname, duty,
                            department, station, state, description);
                }
            } else {
                if (pos == 0) {
                    insertUser(con_history, uid, usrname, duty, flag, department);
                } else if (pos == 1) {
                    moveUser(con_history, con, uid, usrname, logname, duty,
                            department, station, state, description);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // The original never closed its connections (resource leak).
            closeQuietly(con);
            closeQuietly(con_history);
        }
        return false;
    }

    /** True when a blogusers row with the given id exists on the connection. */
    private static boolean existsUser(Connection c, int uid) throws SQLException {
        PreparedStatement ps =
                c.prepareStatement("select 1 from blogusers where id=?");
        try {
            ps.setInt(1, uid);
            ResultSet rs = ps.executeQuery();
            boolean found = rs.next();
            rs.close();
            return found;
        } finally {
            ps.close();
        }
    }

    /** Inserts one user row via a parameterised statement. */
    private static void insertUser(Connection c, int uid, String usrname,
            String duty, int flag, String department) throws SQLException {
        PreparedStatement ps = c.prepareStatement(
                "insert into blogusers(id,TRUENAME,DUTYNAME,FLAG,DEPMENT) values(?,?,?,?,?)");
        try {
            ps.setInt(1, uid);
            ps.setString(2, usrname);
            ps.setString(3, duty);
            ps.setInt(4, flag);
            ps.setString(5, department);
            ps.executeUpdate();
        } finally {
            ps.close();
        }
    }

    /** Updates one user row via a parameterised statement. */
    private static void updateUser(Connection c, int uid, String usrname,
            String duty, int flag, String department) throws SQLException {
        PreparedStatement ps = c.prepareStatement(
                "update blogusers set TRUENAME=?, DUTYNAME=?, FLAG=?, DEPMENT=? where id=?");
        try {
            ps.setString(1, usrname);
            ps.setString(2, duty);
            ps.setInt(3, flag);
            ps.setString(4, department);
            ps.setInt(5, uid);
            ps.executeUpdate();
        } finally {
            ps.close();
        }
    }

    /**
     * Moves one user: inserts into dest, deletes from src. The inserted
     * values come from the parameters; the original also SELECTed several
     * columns from the source row into locals it never used (dead code,
     * removed).
     */
    private void moveUser(Connection src, Connection dest, int uid,
            String usrname, String logname, String duty, String department,
            String station, String state, String description)
            throws SQLException {
        insertUser(dest, uid, usrname, duty,
                department.startsWith("中央") ? 1 : 2, department);
        PreparedStatement del =
                src.prepareStatement("delete from blogusers where id=?");
        try {
            del.setInt(1, uid);
            del.executeUpdate();
        } finally {
            del.close();
        }
    }

    /** Closes a connection, ignoring errors and nulls (best effort). */
    private static void closeQuietly(Connection c) {
        if (c != null) {
            try {
                c.close();
            } catch (SQLException ignored) {
                // best effort on cleanup
            }
        }
    }
}

抓取网页数据

暂时没有事情做,所以就研究一些小东东,以前经常听人家说抓取网站数据呀,感觉好牛呀,

所以自己也来研究一下下,只是没有成为牛人一组,写了一段代码,以后再慢慢的改,希望

能改成搜索引擎那样子,随意抓取各大网站数据。

// Scan HTML for href attributes and return the link targets, one per line
// (quotes are kept, matching the original output format).
private string GetUrl(string strWebContent)
{
    string strRef = @"(href|HREF)[ ]*=[ ]*[""'][^""'#>]+[""']";
    string strResult = "";
    MatchCollection matches = new Regex(strRef).Matches(strWebContent);
    for (int i = 0; i < matches.Count; i++)
    {
        // BUG FIX: the pattern also matches "HREF =", but the original
        // only stripped the literal lowercase "href=" prefix.
        strResult += Regex.Replace(matches[i].ToString(),
            @"(href|HREF)[ ]*=[ ]*", "") + "\r\n";
    }
    return strResult;
}
// Returns the regex pattern used to find href attributes.
// NOTE(review): the original also constructed a Regex it never used
// (removed). If this was meant to RUN the pattern rather than return it,
// the method body needs rework — confirm intent.
private string GetUrl()
{
    string strRef = @"(href|HREF)[ ]*=[ ]*[""'][^""'#>]+[""']";
    return strRef;
}
/// <summary>
/// 去掉HTML 标签并解码基本实体
/// </summary>
/// <param name="strHtml">待转化的字符串</param>
/// <returns>经过转化的字符串</returns>
private string stripHtml(string strHtml)
{
    // Remove every tag ("."|"\n" handles tags spanning lines).
    Regex objRegExp = new Regex("<(.|\n)+?>");
    string strOutput = objRegExp.Replace(strHtml, "");
    // Decode the basic entities. The original literals were garbled by
    // extraction (they replaced "<" with "<"); reconstructed as the
    // standard &lt;/&gt; decoding — confirm no other entities were handled.
    strOutput = strOutput.Replace("&lt;", "<");
    strOutput = strOutput.Replace("&gt;", ">");
    return strOutput;
}
// Extract the contents of the <title> element from an HTML document.
private string GetTitle(string strWebContent)
{
    const string titlePattern = "<title>([^<]*)</title>";
    Match titleMatch = Regex.Match(
        strWebContent,
        titlePattern,
        RegexOptions.IgnoreCase | RegexOptions.Multiline);
    return titleMatch.Groups[1].Value;
}
// Extract the META description of an HTML document.
// (The original regex literal was broken across two lines by extraction
// and would not compile; rejoined here.)
private string GetDescription(string strWebContent)
{
    Match Desc = Regex.Match(strWebContent,
        "<Meta name=\"DESCRIPTION\" content=\"([^<]*)\">",
        RegexOptions.IgnoreCase | RegexOptions.Multiline);
    return Desc.Groups[1].Value;
}
// Download the HTML source of a URL, decoded as GB2312; returns "" on
// any failure (deliberate best-effort, as in the original).
private string GetWebContent(string Url)
{
    string strResult = "";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        request.Timeout = 30000; // 30 s connection timeout
        request.Headers.Set("Pragma", "no-cache");
        // BUG FIX: the original never disposed the response, stream or
        // reader, leaking connections under load.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        using (StreamReader streamReader =
            new StreamReader(streamReceive, Encoding.GetEncoding("GB2312")))
        {
            strResult = streamReader.ReadToEnd();
        }
    }
    catch
    {
        // Best-effort: network errors yield an empty result by design.
    }
    return strResult;
}

Java 基础:利用HttpClient 获取网页内容

HTTP 协议是目前互联网上最重要的协议,许多软件与服务都需要依赖HTTP 协议。

虽然java.net 这个package 中包含了对HTTP 的基本支持,但还有很多高级和复杂的功能无

法实现,这不能不说是一个遗憾。

HttpClient 作为Apache 的开源项目之一,为基于HTTP 协议的操作提供了强大的客户端

执行支持,最新的版本为3.0RC3。

下面通过一个例子简要展示HttpClient 的使用方法:

--------------------------------------------------------------------------------

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.*;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpConnection;
import org.apache.commons.httpclient.HttpMethodBase;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
/**
* @author steven
*/
/**
 * Helper around Jakarta Commons HttpClient 3.x for fetching pages via
 * GET/POST with a shared, tuned connection manager.
 *
 * @author steven
 */
public class HttpClientExample {
    // Shared connection manager and its tuning parameters.
    private static MultiThreadedHttpConnectionManager manager =
            new MultiThreadedHttpConnectionManager();
    private static int connectionTimeOut = 20000;
    private static int socketTimeOut = 10000;
    private static int maxConnectionPerHost = 5;
    private static int maxTotalConnections = 40;
    // Whether the connection manager has been configured yet.
    private static boolean initialed = false;

    /** Applies the timeout and connection-limit settings to the manager. */
    public static void SetPara() {
        manager.getParams().setConnectionTimeout(connectionTimeOut);
        manager.getParams().setSoTimeout(socketTimeOut);
        manager.getParams()
                .setDefaultMaxConnectionsPerHost(maxConnectionPerHost);
        manager.getParams().setMaxTotalConnections(maxTotalConnections);
        initialed = true;
    }

    /**
     * Fetches a URL with GET and returns the body re-encoded to
     * {@code encode}. Returns "" on any error.
     */
    public static String getGetResponseWithHttpClient(String url, String encode) {
        HttpClient client = new HttpClient(manager);
        // BUG FIX: the original tested "if (initialed)", so it only
        // configured the manager AFTER it had already been configured —
        // i.e. never before the first request.
        if (!initialed) {
            HttpClientExample.SetPara();
        }
        GetMethod get = new GetMethod(url);
        get.setFollowRedirects(true);
        String result = null;
        try {
            client.executeMethod(get);
            // getResponseBodyAsString() is unsafe when the page size is
            // unknown, so stream the body instead.
            String body = readBody(get);
            // iso-8859-1 is the default reading encode
            result = ConverterStringCode(body, get.getResponseCharSet(), encode);
        } catch (Exception e) {
            e.printStackTrace();
            result = "";
        } finally {
            get.releaseConnection();
        }
        // BUG FIX: the original returned from inside "finally", which
        // silently swallows any exception thrown in "try".
        return result;
    }

    /**
     * Fetches a URL with an empty-bodied POST and returns the body
     * re-encoded to {@code encode}. Returns "" on any error.
     */
    public static String getPostResponseWithHttpClient(String url,
                                                       String encode) {
        HttpClient client = new HttpClient(manager);
        if (!initialed) {
            HttpClientExample.SetPara();
        }
        PostMethod post = new PostMethod(url);
        post.setFollowRedirects(false);
        String result = null;
        try {
            client.executeMethod(post);
            // iso-8859-1 is the default reading encode
            result = ConverterStringCode(readBody(post),
                    post.getResponseCharSet(), encode);
        } catch (Exception e) {
            e.printStackTrace();
            result = "";
        } finally {
            post.releaseConnection();
        }
        return result;
    }

    /**
     * Fetches a URL with a POST carrying the given form parameters and
     * returns the body re-encoded to {@code encode}. Returns "" on error.
     */
    public static String getPostResponseWithHttpClient(String url,
                                                       String encode,
                                                       NameValuePair[] nameValuePair) {
        HttpClient client = new HttpClient(manager);
        if (!initialed) {
            HttpClientExample.SetPara();
        }
        PostMethod post = new PostMethod(url);
        post.setRequestBody(nameValuePair);
        post.setFollowRedirects(false);
        String result = null;
        try {
            client.executeMethod(post);
            // iso-8859-1 is the default reading encode
            result = ConverterStringCode(readBody(post),
                    post.getResponseCharSet(), encode);
        } catch (Exception e) {
            e.printStackTrace();
            result = "";
        } finally {
            post.releaseConnection();
        }
        return result;
    }

    /**
     * Reads the response body of an executed method line by line (the
     * three public methods previously duplicated this code).
     */
    private static String readBody(HttpMethodBase method) throws IOException {
        StringBuffer resultBuffer = new StringBuffer();
        BufferedReader in = new BufferedReader(
                new InputStreamReader(
                        method.getResponseBodyAsStream(),
                        method.getResponseCharSet()));
        String inputLine = null;
        while ((inputLine = in.readLine()) != null) {
            resultBuffer.append(inputLine);
            resultBuffer.append("\n");
        }
        in.close();
        return resultBuffer.toString();
    }

    /**
     * Re-encodes a string from srcEncode to destEncode; "" on failure or
     * null input.
     */
    private static String ConverterStringCode(String source, String srcEncode,
                                              String destEncode) {
        // BUG FIX: the original referenced an undefined variable "src"
        // instead of the parameter "source" (compile error).
        if (source != null) {
            try {
                return new String(source.getBytes(srcEncode), destEncode);
            } catch (UnsupportedEncodingException e) {
                e.printStackTrace();
                return "";
            }
        } else {
            return "";
        }
    }
}

--------------------------------------------------------------------------------

之后,就可以通过下面的代码获得目标网页:

String source = HttpClientExample.getGetResponseWithHttpClient("http://www.sina.com.cn", "GBK");

注意,在默认情况下,HttpClient 的Request 的Head 中

User-Agent 的值是Jakarta Commons-HttpClient 3.0RC1,如果需要改变它(例如,变为

Mozilla/4.0),必须在调用之前运行如下语句:

System.getProperties().setProperty("httpclient.useragent", "Mozilla/4.0");

java 抓取网页乱码问题处理

// Fetch a page and decode it as gb2312 to avoid garbled Chinese text.
String htmlContent = "";
java.io.InputStream inputStream;
// BUG FIX: the original URL ("www.csdn.net ") lacked a protocol and had a
// trailing space, so "new URL(...)" threw MalformedURLException.
java.net.URL url = new java.net.URL("http://www.csdn.net");
java.net.HttpURLConnection connection = (java.net.HttpURLConnection)
        url.openConnection();
connection.connect();
inputStream = connection.getInputStream();
// BUG FIX: the original wrote into a fixed 100 KB array (overflow for
// larger pages) and never closed the stream. Accumulate the raw bytes and
// decode ONCE at the end — decoding chunk by chunk could split a
// multi-byte gb2312 character.
java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();
byte[] bytes = new byte[4096];
int count;
while ((count = inputStream.read(bytes)) != -1) {
    buffer.write(bytes, 0, count);
}
inputStream.close();
htmlContent = new String(buffer.toByteArray(), "gb2312");
System.out.println(htmlContent);