1. Using HtmlParser to fetch web page content:
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;

public class TestYahoo {

    public static void testHtml() {
        try {
            String sCurrentLine;
            String sTotalString;
            sCurrentLine = "";
            sTotalString = "";
            java.io.InputStream l_urlStream;
            java.net.URL l_url = new java.net.URL("http://sports.sina.com.cn/iframe/nba/live/");
            java.net.HttpURLConnection l_connection = (java.net.HttpURLConnection) l_url.openConnection();
            l_connection.connect();
            l_urlStream = l_connection.getInputStream();
            java.io.BufferedReader l_reader = new java.io.BufferedReader(
                    new java.io.InputStreamReader(l_urlStream));
            // Read the whole page into one string
            while ((sCurrentLine = l_reader.readLine()) != null) {
                sTotalString += sCurrentLine;
            }
            System.out.println(sTotalString);

            System.out.println("====================");
            String testText = extractText(sTotalString);
            System.out.println(testText);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Extract the plain-text content from an HTML string.
     *
     * @param inputHtml the HTML to parse
     * @return the plain text of the first matched node
     */
    public static String extractText(String inputHtml) throws Exception {
        StringBuffer text = new StringBuffer();

        Parser parser = Parser.createParser(new String(inputHtml.getBytes(), "8859_1"), "8859_1");
        // Walk over every node in the document
        NodeList nodes = parser.extractAllNodesThatMatch(new NodeFilter() {
            public boolean accept(Node node) {
                return true;
            }
        });
        Node node = nodes.elementAt(0);
        text.append(new String(node.toPlainTextString().getBytes("8859_1")));
        return text.toString();
    }

    /**
     * Analyze content read from a file. The resource can also be a URL.
     *
     * @param resource a file path or URL
     */
    public static void test5(String resource) throws Exception {
        Parser myParser = new Parser(resource);

        // Set the character encoding
        myParser.setEncoding("GBK");
        String filterStr = "table";
        NodeFilter filter = new TagNameFilter(filterStr);
        NodeList nodeList = myParser.extractAllNodesThatMatch(filter);
        TableTag tabletag = (TableTag) nodeList.elementAt(11);

        System.out.println(tabletag.toHtml());
        System.out.println("==============");
    }

    /*
     * public static void main(String[] args) { TestYahoo testYahoo = new
     * TestYahoo(); testYahoo.testHtml(); }
     */

    public static void main(String[] args) throws Exception {
        test5("http://sports.yahoo.com/nba/scoreboard");
    }
}
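The same extractAllNodesThatMatch approach works for any tag type, not just tables. As a minimal sketch (assuming HtmlParser 1.6 on the classpath; the class name LinkLister and the target URL are illustrative, not part of the original post), the following pulls every hyperlink from a page with a NodeClassFilter instead of a TagNameFilter:

import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

public class LinkLister {
    public static void main(String[] args) throws Exception {
        // Hypothetical target URL, reused from the sample above for illustration
        Parser parser = new Parser("http://sports.sina.com.cn/iframe/nba/live/");
        parser.setEncoding("GBK");
        // Match every <a> tag by node class rather than by tag name
        NodeList links = parser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
        for (int i = 0; i < links.size(); i++) {
            LinkTag link = (LinkTag) links.elementAt(i);
            System.out.println(link.getLink() + " -> " + link.getLinkText());
        }
    }
}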
2. jsoup is an HTML parsing library that is easier to use than htmlparser; its most convenient feature is that it supports jQuery-like selector syntax. Download: http://jsoup.org/
import java.util.HashMap;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * @author Rocky
 */
public class Spider {
    private static final String POSTURL = "http://59.49.18.116:8008/sxwwpt_wai/inquire/illegalAction!carInquire.action";

    private void spiderData() throws Exception {
        Map<String, String> req = new HashMap<String, String>();
        req.put("authCode", "");
        req.put("csjcKey", "110000");
        req.put("hpzl", "02");
        req.put("vioViolation.hphm", "xxx"); // your license plate number
        req.put("type", "1");
        req.put("pagination.currentPage", "1");
        req.put("pagination.pageSize", "5");

        Document doc = Jsoup.connect(POSTURL).data(req).get();
        Elements newsHeadlines = doc.select(".if_tr td");
        System.out.println(newsHeadlines.text());
    }

    /**
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        Spider spider = new Spider();
        spider.spiderData();
    }
}
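To illustrate the jQuery-like selector syntax mentioned above, here is a minimal sketch (the class name SelectorDemo, the target URL, and the selectors are illustrative assumptions, not part of the original post) that fetches a page and queries it with CSS selectors:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class SelectorDemo {
    public static void main(String[] args) throws Exception {
        // Fetch and parse the page in one call
        Document doc = Jsoup.connect("http://jsoup.org/").get();
        System.out.println(doc.title());

        // jQuery/CSS-style selector: every anchor that has an href attribute
        Elements links = doc.select("a[href]");
        for (Element link : links) {
            // "abs:href" resolves relative URLs against the page's base URI
            System.out.println(link.attr("abs:href") + " : " + link.text());
        }
    }
}

Selectors such as ".if_tr td" in the example above work the same way as they would in jQuery: class, id, tag, and attribute selectors can be combined and nested.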