Htmlparser是一个很好用的html内容分析工具。
htmlparser是一个纯java编写的html解析库,它不依赖于其它的java库文件,主要用于改造或提取html。
它能超高速解析html,而且不会出错。现在htmlparser最新版本为2.0。
毫不夸张地说,htmlparser就是目前最好的html解析和分析的工具。
无论你是想抓取网页数据还是改造html的内容,用了htmlparser绝对会忍不住称赞。
附上一点儿代码,留个纪念。(⊙_⊙)
package home.study.htmlparser.main;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.TextExtractingVisitor;
/**
 * Small demonstration of three ways to read an HTML page with the HTML Parser
 * library: a text-extracting visitor, manual node iteration combined with a
 * filter, and the parser's own filter-based extraction.
 */
public class HtmlParserStudy
{
    /** Address of the page that every demonstration in this class parses. */
    private static final String URL_ADDRESS = "http://localhost:8080/htmlparsertest/HTMLParserTester.html";

    /**
     * Runs the three demonstrations in order against one shared parser and
     * prints the stack trace of any failure.
     *
     * @param args command line arguments (unused)
     */
    public static void main(String[] args)
    {
        try {
            Parser parser = createParser();
            getEverythingInTags(parser);
            // Every demonstration reads the page through to its end. Rewind the
            // parser before the next pass, otherwise that pass starts at the
            // end of the page and finds no nodes.
            parser.reset();
            getNodeInformation(parser);
            parser.reset();
            getNodeInfoByFilter(parser);
        } catch (ParserException e) {
            e.printStackTrace();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Normal way to get a parser: open a connection to {@link #URL_ADDRESS}
     * and hand it to the parser.
     *
     * @return a parser reading from the opened connection
     * @throws ParserException
     *             if the parser cannot be constructed on the connection
     * @throws MalformedURLException
     *             if {@link #URL_ADDRESS} is not a valid URL
     * @throws IOException
     *             if the connection cannot be opened
     */
    public static Parser createParser() throws ParserException, MalformedURLException, IOException
    {
        URL pageUrl = new URL(URL_ADDRESS);
        return new Parser((HttpURLConnection) pageUrl.openConnection());
    }

    /**
     * The normal way to visit the content in a page: walk all nodes with a
     * {@link TextExtractingVisitor} and print the text it extracted.
     *
     * @param parser
     *            parser positioned at the start of the page
     * @throws ParserException
     *             if the page cannot be parsed
     */
    public static void getEverythingInTags(Parser parser) throws ParserException
    {
        // there are several kinds of NodeVisitors; this one gathers the text
        TextExtractingVisitor visitor = new TextExtractingVisitor();
        // visit all the nodes in the html page given to the parser
        parser.visitAllNodesWith(visitor);
        // print the text the visitor extracted
        System.out.println(visitor.getExtractedText());
    }

    /**
     * Get the nodes, then use them: iterate over the parser's nodes by hand,
     * collect every node accepted by a {@link HasChildFilter} wrapping a
     * {@link TagNameFilter} for "a", and print the collected nodes as HTML.
     *
     * @param parser
     *            parser positioned at the start of the page
     * @throws ParserException
     *             if the page cannot be parsed
     */
    public static void getNodeInformation(Parser parser) throws ParserException
    {
        // The filters do not depend on the node being visited, so build them once.
        NodeFilter aFilter = new TagNameFilter("a");
        NodeFilter hasChild = new HasChildFilter(aFilter);
        NodeList nodesHaveChildren = new NodeList();
        // Ask the parser for a single iterator and drive the whole pass with
        // it, instead of requesting a new iterator for every test and fetch.
        for (NodeIterator nodes = parser.elements(); nodes.hasMoreNodes();) {
            Node currentNode = nodes.nextNode();
            currentNode.collectInto(nodesHaveChildren, hasChild);
        }
        System.out.println(nodesHaveChildren.toHtml());
    }

    /**
     * Let the parser apply a filter itself: extract every node matching a
     * {@link TagNameFilter} for "tr" and print each one, preceded by its
     * index in the result list.
     * <p>
     * NodeFilter contains AndFilter, OrFilter, CssSelectorNodeFilter,
     * HasAttributeFilter, HasChildFilter, HasParentFilter, HasSiblingFilter,
     * TagNameFilter and so on.
     *
     * @param parser
     *            parser positioned at the start of the page
     * @throws ParserException
     *             if the page cannot be parsed
     */
    public static void getNodeInfoByFilter(Parser parser) throws ParserException
    {
        TagNameFilter trTag = new TagNameFilter("tr");
        NodeList trNodeList = parser.extractAllNodesThatMatch(trTag);
        for (int i = 0; i < trNodeList.size(); i++) {
            System.out.println(i + " node :");
            System.out.println(trNodeList.elementAt(i).toHtml());
        }
    }
}