HtmlParser和Jsoup

原创

赶路人儿 2022-11-08 20:09:56 博主文章分类：java ©著作权

©著作权归作者所有：来自51CTO博客作者赶路人儿的原创作品，请联系作者获取转载授权，否则将追究法律责任

1、利用HtmlParser获取网页信息：
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;

public
 
class
 TestYahoo {
     
public
 
static
 
void
 testHtml() {
         
try
 {
             String sCurrentLine;
             String sTotalString;
             sCurrentLine 
=
 
""
;
             sTotalString 
=
 
""
;
             java.io.InputStream l_urlStream;
             java.net.URL l_url 
=
 
new
 java.net.URL(
                     
"
http://sports.sina.com.cn/iframe/nba/live/
"
);
             java.net.HttpURLConnection l_connection 
=
 (java.net.HttpURLConnection) l_url
                     .openConnection();
             l_connection.connect();
             l_urlStream 
=
 l_connection.getInputStream();
             java.io.BufferedReader l_reader 
=
 
new
 java.io.BufferedReader(
                     
new
 java.io.InputStreamReader(l_urlStream));
             
while
 ((sCurrentLine 
=
 l_reader.readLine()) 
!=
 
null
) {
                 sTotalString 
+=
 sCurrentLine;
             }
             System.out.println(sTotalString);

             System.out.println(
"
====================
"
);
             String testText 
=
 extractText(sTotalString);
             System.out.println(testText);
         } 
catch
 (Exception e) {
             e.printStackTrace();
         }

     }

     
/**

      * 抽取纯文本信息
      * 
      * 
@param
 inputHtml
      * 
@return

      
*/

     
public
 
static
 String extractText(String inputHtml) 
throws
 Exception {
         StringBuffer text 
=
 
new
 StringBuffer();

         Parser parser 
=
 Parser.createParser(
new
 String(inputHtml.getBytes(),
                 
"
8859_1
"
), 
"
8859-1
"
);
         
//
 遍历所有的节点

        NodeList nodes 
=
 parser.extractAllNodesThatMatch(
new
 NodeFilter() {
             
public
 
boolean
 accept(Node node) {
                 
return
 
true
;
             }
         });
         Node node 
=
 nodes.elementAt(
0
);
         text.append(
new
 String(node.toPlainTextString().getBytes(
"
8859_1
"
)));
         
return
 text.toString();
     }

     
/**

      * 读取文件的方式来分析内容. filePath也可以是一个Url.
      * 
      * 
@param
 resource
      *            文件/Url
      
*/

     
public
 
static
 
void
 test5(String resource) 
throws
 Exception {
         Parser myParser 
=
 
new
 Parser(resource);

         
//
 设置编码

        myParser.setEncoding(
"
GBK
"
);
         String filterStr 
=
 
"
table
"
;
         NodeFilter filter 
=
 
new
 TagNameFilter(filterStr);
         NodeList nodeList 
=
 myParser.extractAllNodesThatMatch(filter);
         TableTag tabletag 
=
 (TableTag) nodeList.elementAt(
11
);
             
             System.out.println(tabletag.toHtml());
             
             System.out.println(
"
==============
"
);

     }

     
/*

      * public static void main(String[] args) { TestYahoo testYahoo = new
      * TestYahoo(); testYahoo.testHtml(); }
      
*/

     
public
 
static
 
void
 main(String[] args) 
throws
 Exception {
         test5(
"
http://sports.yahoo.com/nba/scoreboard
"
);
     }
 
}

2、比htmlparser更好用的html解析工具jsoup，最人性化的地方是，它支持类jquery语法。下载地址：http://jsoup.org/
import java.util.HashMap;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
 7 /**
 8  * 
 9  * @author Rocky
10  *
11  */
12 public class spider {
13     private static final String POSTURL="http://59.49.18.116:8008/sxwwpt_wai/inquire/illegalAction!carInquire.action";
14     private void spiderData() throws Exception{
15          Map<String,String> req=new HashMap<String,String>();
16          req.put("authCode", "");
17          req.put("csjcKey","110000");
18          req.put("hpzl", "02");
19          req.put("vioViolation.hphm", "xxx");//您的车牌号
20          req.put("type","1");
21          req.put("pagination.currentPage", "1");
22          req.put("pagination.pageSize", "5");
23          
24          Document doc=Jsoup.connect(POSTURL).data(req).get();
25          Elements newsHeadlines=doc.select(".if_tr td");
26          System.out.println(newsHeadlines.text());
27     }
28     /**
29      * @param args
30      * @throws Exception 
31      */
32     public static void main(String[] args) throws Exception {
33         
34         spider spider=new spider();
35         spider.spiderData();
36     }
37 
38 }