Document解析 Java html java html解析库

转载

detailtoo 2023-06-30 19:07:59

文章标签 HTML解析 HTML java Text 文章分类 Java 后端开发

这几天参加公司的定级考试，其中有一道上机题是：

访问 URL http://www.weather.com.cn/weather/101010100.shtml 对应的页面，提取出页面中的天气信息，然后把信息按照要求输出到控制台。一开始想到的是先把 HTML 文件存到本地，然后再逐行用正则表达式去解析；后来觉得这种方法太土，而且解析起来会很复杂，所以就想用 SAX 去解析。试了一下，程序执行起来太慢，半天出不了结果，而且网络上的 HTML 标签不规范，标签不一定都有结束标签，所以解析时会报错。后来在网上搜了一下，原来 javax.swing.text.html 包中已经提供了解析 HTML 标签的类库。我参考了网上别人的博客，地址如下：

程序如下：

1 package com.thunisoft.kms.java.lvl2.exam;
  2 
  3 import java.io.BufferedReader;
  4 import java.io.FileInputStream;
  5 import java.io.FileOutputStream;
  6 import java.io.IOException;
  7 import java.io.InputStream;
  8 import java.io.InputStreamReader;
  9 import java.io.Reader;
 10 import java.net.URL;
 11 import java.net.URLConnection;
 12 import java.util.Vector;
 13 import java.util.regex.Matcher;
 14 import java.util.regex.Pattern;
 15 
 16 import javax.swing.text.MutableAttributeSet;
 17 import javax.swing.text.html.HTML;
 18 import javax.swing.text.html.HTMLEditorKit;
 19 import javax.swing.text.html.HTMLEditorKit.ParserCallback;
 20 import javax.swing.text.html.parser.ParserDelegator;
 21 
 /**
  * Downloads a weather.com.cn forecast page, extracts the text of its forecast
  * block and prints it to the console.
  * <p>
  * The page is first saved to a local file and then run through the Swing HTML
  * parser ({@link ParserDelegator}). This class is the parser callback: while the
  * parser is inside the {@code div} whose id is {@code "7d"} it collects the text
  * of {@code td} cells and of links whose href ends in {@code ".php"}.
  * <p>
  * Collected text is kept in the static {@link #element} list, which is shared by
  * all instances, so only one parse may run at a time.
  * <p>
  * Copyright: Copyright (c) 2007
  *
  * @author keep at it
  * @version 1.0
  * @date 2013-12-4
  */
 public class GrapWeatherInfo extends ParserCallback
 {
     /** True while inside a table whose class attribute is "yuBaoTable". */
     protected boolean isTable = false;
     /** True while inside a link that qualified in {@link #handleStartTag}. */
     protected boolean isAlink = false;
     /** True while inside the div whose id attribute is "7d". */
     protected boolean isDiv = false;
     /** True while inside a td cell that was opened inside the target div. */
     protected boolean isTd = false;
     /** Collected text fragments, plus a ";" marker appended at table ends. */
     protected static Vector<String> element = new Vector<String>();
     /** Never reassigned by this class; text is only collected while it is non-null. */
     protected static String paragraphText = "";
     /** Default URL of the page to download. */
     private static final String FILE_URL = "http://www.weather.com.cn/weather/101010100.shtml";
     /** Default location of the local copy (a Windows path; override via the second argument). */
     private static final String FILE_LOCATION = "E:/url.html";

     /** Matches runs of whitespace; compiled once instead of on every text callback. */
     private static final Pattern WHITESPACE = Pattern.compile("\\s+");
     /** Marker stored in {@link #element} at a table end; printed as a line break. */
     private static final String TABLE_END_MARKER = ";";
     /**
      * The two Chinese characters meaning "day of the week". Written with Unicode
      * escapes so the class compiles identically under any source encoding.
      */
     private static final String WEEKDAY_MARKER = "\u661f\u671f";
     /** The two Chinese characters meaning "night-time" (Unicode escapes, see above). */
     private static final String NIGHT_MARKER = "\u591c\u95f4";
     /** Gap printed after every fragment. */
     private static final String COLUMN_GAP = "   ";
     /** Give up connecting after this many milliseconds instead of hanging forever. */
     private static final int CONNECT_TIMEOUT_MS = 10000;
     /** Give up on a stalled read after this many milliseconds. */
     private static final int READ_TIMEOUT_MS = 30000;

     /** Creates a callback with all "inside tag" flags cleared. */
     public GrapWeatherInfo()
     {
     }

     /**
      * Parses the HTML supplied by the reader and prints the collected forecast.
      *
      * @param r reader over the HTML document; the caller remains responsible for closing it
      * @throws IOException if reading or parsing the document fails
      */
     private static void startParse(Reader r) throws IOException
     {
         // The result list is static: drop leftovers from any earlier parse.
         element.clear();
         ParserCallback callback = new GrapWeatherInfo();
         // 'true' tells the parser to ignore charset directives inside the document.
         new ParserDelegator().parse(r, callback, true);
         printForecast(element);
     }

     /**
      * Prints the collected fragments, one forecast table per line.
      *
      * @param items fragments in document order, with ";" markers at table ends
      */
     private static void printForecast(Vector<String> items)
     {
         String lastWeekday = "";
         // Starts at index 1, so the first collected fragment is never printed.
         // NOTE(review): presumably a heading that should be skipped - confirm
         // against the live page before changing.
         for (int i = 1; i < items.size(); i++)
         {
             String item = items.get(i);
             if (item.contains(WEEKDAY_MARKER))
             {
                 lastWeekday = item;
             }
             if (TABLE_END_MARKER.equals(item))
             {
                 System.out.println();
             }
             else if (!">".equals(item))
             {
                 // A night-time entry that does not directly follow a weekday label
                 // starts a new line prefixed with the most recent weekday label.
                 if (item.endsWith(NIGHT_MARKER)
                         && !items.get(i - 1).contains(WEEKDAY_MARKER))
                 {
                     System.out.println();
                     System.out.print(lastWeekday + COLUMN_GAP);
                 }
                 System.out.print(item + COLUMN_GAP);
             }
         }
     }

     /**
      * Collects text found inside a qualifying link or a td cell, with all
      * whitespace removed.
      *
      * @param data the text reported by the parser
      * @param pos  position in the document (unused)
      */
     @Override
     public void handleText(char[] data, int pos)
     {
         if ((isAlink || isTd) && paragraphText != null)
         {
             element.addElement(WHITESPACE.matcher(new String(data)).replaceAll(""));
         }
     }

     /**
      * Updates the "inside tag" flags when an opening tag is seen.
      *
      * @param t   the tag being opened
      * @param a   the attributes of the tag
      * @param pos position in the document (unused)
      */
     @Override
     public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos)
     {
         // The div with id "7d" is the block we want; other divs are ignored.
         if (t == HTML.Tag.DIV && "7d".equals(a.getAttribute(HTML.Attribute.ID)))
         {
             isDiv = true;
         }
         // Tables are told apart by their class attribute.
         if (t == HTML.Tag.TABLE
                 && "yuBaoTable".equals(a.getAttribute(HTML.Attribute.CLASS)))
         {
             isTable = true;
         }
         // Links count only inside the target div, without an id, and with an
         // href that ends in ".php".
         if (t == HTML.Tag.A && isDiv && a.getAttribute(HTML.Attribute.ID) == null)
         {
             Object href = a.getAttribute(HTML.Attribute.HREF);
             if (href != null && href.toString().endsWith(".php"))
             {
                 isAlink = true;
             }
         }
         if (t == HTML.Tag.TD && isDiv)
         {
             isTd = true;
         }
     }

     /**
      * Called by the parser on a parse error; errors are deliberately ignored.
      *
      * @param errorMsg description of the problem
      * @param pos      position in the document
      */
     @Override
     public void handleError(String errorMsg, int pos)
     {
     }

     /**
      * Treats a simple tag exactly like an opening tag.
      *
      * @param t   the tag
      * @param a   the attributes of the tag
      * @param pos position in the document
      */
     @Override
     public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos)
     {
         handleStartTag(t, a, pos);
     }

     /**
      * Returns the value of {@link #paragraphText}.
      *
      * @return the current paragraph text; this class never changes it from the empty string
      */
     public static String getParagraphText()
     {
         return paragraphText;
     }

     /**
      * Called by the parser for comments; comments are ignored.
      *
      * @param data the comment text
      * @param pos  position in the document
      */
     @Override
     public void handleComment(char[] data, int pos)
     {
     }

     /**
      * Updates the "inside tag" flags when a closing tag is seen.
      *
      * @param t   the tag being closed
      * @param pos position in the document (unused)
      */
     @Override
     public void handleEndTag(HTML.Tag t, int pos)
     {
         if (t == HTML.Tag.A)
         {
             isAlink = false;
         }
         else if (t == HTML.Tag.TABLE)
         {
             if (!isAlink)
             {
                 isTable = false;
                 // Separates the text of consecutive tables so each prints on its
                 // own line. NOTE(review): the marker is added for every closing
                 // table, not only "yuBaoTable" ones - kept as in the original.
                 element.addElement(TABLE_END_MARKER);
             }
         }
         else if (t == HTML.Tag.DIV)
         {
             // NOTE(review): any closing div outside a target table clears the
             // flag, including divs nested in the target div - kept as in the
             // original.
             if (!isTable)
             {
                 isDiv = false;
             }
         }
         else if (t == HTML.Tag.TD)
         {
             isTd = false;
         }
     }

     /**
      * Program entry point: downloads the page, saves it locally, then parses the
      * local copy and prints the forecast.
      *
      * @param args optional overrides: {@code args[0]} is the page URL and
      *             {@code args[1]} the local file path; the built-in defaults
      *             are used for any that are missing
      */
     public static void main(String[] args)
     {
         String pageUrl = (args != null && args.length > 0) ? args[0] : FILE_URL;
         String localPath = (args != null && args.length > 1) ? args[1] : FILE_LOCATION;
         try
         {
             download(pageUrl, localPath);
             parseLocalCopy(localPath);
         }
         catch (Exception e)
         {
             e.printStackTrace();
         }
     }

     /**
      * Copies the content behind a URL into a local file. Both streams are closed
      * before this method returns, so the file is complete when it is read back.
      *
      * @param pageUrl   the URL to fetch
      * @param localPath the file to write
      * @throws IOException if the download or the write fails
      */
     private static void download(String pageUrl, String localPath) throws IOException
     {
         URLConnection conn = new URL(pageUrl).openConnection();
         conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
         conn.setReadTimeout(READ_TIMEOUT_MS);
         InputStream input = conn.getInputStream();
         try
         {
             FileOutputStream fos = new FileOutputStream(localPath);
             try
             {
                 byte[] buffer = new byte[1024];
                 int read;
                 while ((read = input.read(buffer)) != -1)
                 {
                     fos.write(buffer, 0, read);
                 }
             }
             finally
             {
                 fos.close();
             }
         }
         finally
         {
             input.close();
         }
     }

     /**
      * Reads the saved page as UTF-8 text and hands it to the parser.
      *
      * @param localPath the file written by the download step
      * @throws IOException if the file cannot be read or parsed
      */
     private static void parseLocalCopy(String localPath) throws IOException
     {
         BufferedReader reader = new BufferedReader(new InputStreamReader(
                 new FileInputStream(localPath), "UTF-8"));
         try
         {
             startParse(reader);
         }
         finally
         {
             reader.close();
         }
     }
 }

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。