HTMLParser 和 jsoup
原创
©著作权归作者所有:来自51CTO博客作者赶路人儿的原创作品,请联系作者获取转载授权,否则将追究法律责任
1、利用 HTMLParser(org.htmlparser)获取网页信息:
import org.htmlparser.Node;
import
org.htmlparser.NodeFilter;
import
org.htmlparser.Parser;
import
org.htmlparser.filters.TagNameFilter;
import
org.htmlparser.tags.TableTag;
import
org.htmlparser.util.NodeList;
public
class
TestYahoo {
public
static
void
testHtml() {
try
{
String sCurrentLine;
String sTotalString;
sCurrentLine
=
""
;
sTotalString
=
""
;
java.io.InputStream l_urlStream;
java.net.URL l_url
=
new
java.net.URL(
"
http://sports.sina.com.cn/iframe/nba/live/
"
);
java.net.HttpURLConnection l_connection
=
(java.net.HttpURLConnection) l_url
.openConnection();
l_connection.connect();
l_urlStream
=
l_connection.getInputStream();
java.io.BufferedReader l_reader
=
new
java.io.BufferedReader(
new
java.io.InputStreamReader(l_urlStream));
while
((sCurrentLine
=
l_reader.readLine())
!=
null
) {
sTotalString
+=
sCurrentLine;
}
System.out.println(sTotalString);
System.out.println(
"
====================
"
);
String testText
=
extractText(sTotalString);
System.out.println(testText);
}
catch
(Exception e) {
e.printStackTrace();
}
}
/**
* 抽取纯文本信息
*
*
@param
inputHtml
*
@return
*/
public
static
String extractText(String inputHtml)
throws
Exception {
StringBuffer text
=
new
StringBuffer();
Parser parser
=
Parser.createParser(
new
String(inputHtml.getBytes(),
"
8859_1
"
),
"
8859-1
"
);
//
遍历所有的节点
NodeList nodes
=
parser.extractAllNodesThatMatch(
new
NodeFilter() {
public
boolean
accept(Node node) {
return
true
;
}
});
Node node
=
nodes.elementAt(
0
);
text.append(
new
String(node.toPlainTextString().getBytes(
"
8859_1
"
)));
return
text.toString();
}
/**
* 读取文件的方式来分析内容. filePath也可以是一个Url.
*
*
@param
resource
* 文件/Url
*/
public
static
void
test5(String resource)
throws
Exception {
Parser myParser
=
new
Parser(resource);
//
设置编码
myParser.setEncoding(
"
GBK
"
);
String filterStr
=
"
table
"
;
NodeFilter filter
=
new
TagNameFilter(filterStr);
NodeList nodeList
=
myParser.extractAllNodesThatMatch(filter);
TableTag tabletag
=
(TableTag) nodeList.elementAt(
11
);
System.out.println(tabletag.toHtml());
System.out.println(
"
==============
"
);
}
/*
* public static void main(String[] args) { TestYahoo testYahoo = new
* TestYahoo(); testYahoo.testHtml(); }
*/
public
static
void
main(String[] args)
throws
Exception {
test5(
"
http://sports.yahoo.com/nba/scoreboard
"
);
}
}
2、比 HTMLParser 更好用的 HTML 解析工具 jsoup:最人性化的地方是它支持类似 jQuery 的选择器语法。下载地址:http://jsoup.org/
mport java.util.HashMap;
2 import java.util.Map;
3
4 import org.jsoup.Jsoup;
5 import org.jsoup.nodes.Document;
6 import org.jsoup.select.Elements;
7 /**
8 *
9 * @author Rocky
10 *
11 */
12 public class spider {
13 private static final String POSTURL="http://59.49.18.116:8008/sxwwpt_wai/inquire/illegalAction!carInquire.action";
14 private void spiderData() throws Exception{
15 Map<String,String> req=new HashMap<String,String>();
16 req.put("authCode", "");
17 req.put("csjcKey","110000");
18 req.put("hpzl", "02");
19 req.put("vioViolation.hphm", "xxx");//您的车牌号
20 req.put("type","1");
21 req.put("pagination.currentPage", "1");
22 req.put("pagination.pageSize", "5");
23
24 Document doc=Jsoup.connect(POSTURL).data(req).get();
25 Elements newsHeadlines=doc.select(".if_tr td");
26 System.out.println(newsHeadlines.text());
27 }
28 /**
29 * @param args
30 * @throws Exception
31 */
32 public static void main(String[] args) throws Exception {
33
34 spider spider=new spider();
35 spider.spiderData();
36 }
37
38 }