获取html页面全部标签或者标签内容

原创

isea533 2022-08-19 15:15:02 博主文章分类：JAVA编程 ©著作权

©著作权归作者所有：来自51CTO博客作者isea533的原创作品，请联系作者获取转载授权，否则将追究法律责任

首先是两个正则表达式：

1. <[^>]+>：这个正则表达式可以匹配常规的 HTML 标签（注意页面编码方式和读取时使用的编码方式要一致）。需要注意的是，如果属性值、注释或脚本中本身含有“>”字符，匹配会在该字符处提前结束，因此并不能保证 100% 准确。

2. >[^<]+<：这个可以匹配标签内容。本人对正则不是很熟悉，因而只是简单地将第一个正则表达式反了过来，所以匹配出来的结果都会带着“>”和“<”，需要在程序里去掉。如果有更好的正则表达式，希望可以告诉我。

下面上程序：

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates two simple regular expressions for scraping an HTML page one line at a time:
 * one that extracts the tags themselves and one that extracts the text found between tags.
 *
 * <p>Both extraction methods join their matches with {@code "____"} and return {@code null}
 * when the input contains no match.
 */
public class URLTest {

  /** Separator placed between successive matches in the returned string. */
  private static final String SEPARATOR = "____";

  /** Matches a single tag: {@code <}, one or more non-{@code >} characters, then {@code >}. */
  private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>");

  /**
   * Matches text enclosed by {@code >} and {@code <}; group 1 is the text without the
   * surrounding delimiters, so no manual stripping is needed afterwards.
   */
  private static final Pattern CONTENT_PATTERN = Pattern.compile(">([^<]+)<");

  /**
   * Downloads a fixed page and prints, for every line that contains at least one tag,
   * the tags found on that line.
   *
   * @param args unused
   * @throws IOException if the URL is malformed or the page cannot be read
   */
  public static void main(String[] args) throws IOException {
    URL url = new URL("http://www.ascii-code.com/");
    // try-with-resources closes the reader (and the underlying stream) even when reading fails.
    // The charset is explicit so decoding does not depend on the platform default;
    // change it if the target page is served in a different encoding.
    try (BufferedReader br = new BufferedReader(
        new InputStreamReader(url.openStream(), StandardCharsets.UTF_8))) {
      String line;
      while ((line = br.readLine()) != null) {
        String labels = GetLabel(line);
        if (labels != null) {
          System.out.println(labels);
        }
      }
    }
  }

  /**
   * Extracts every piece of text that sits between a {@code >} and the next {@code <}.
   *
   * @param html the markup to scan; may be {@code null}
   * @return the matched texts joined with {@code "____"}, or {@code null} if {@code html}
   *     is {@code null} or contains no such text
   */
  public static String GetContent(String html) {
    return joinMatches(CONTENT_PATTERN, 1, html);
  }

  /**
   * Extracts every tag (for example {@code <li>} or {@code </li>}) from the given markup.
   *
   * @param html the markup to scan; may be {@code null}
   * @return the matched tags joined with {@code "____"}, or {@code null} if {@code html}
   *     is {@code null} or contains no tag
   */
  public static String GetLabel(String html) {
    return joinMatches(TAG_PATTERN, 0, html);
  }

  /**
   * Collects the requested group of every match of {@code pattern} in {@code html}.
   *
   * @param pattern the compiled expression to apply
   * @param group the capture group to collect from each match (0 for the whole match)
   * @param html the text to scan; may be {@code null}
   * @return the collected groups joined with {@link #SEPARATOR}, or {@code null} when
   *     {@code html} is {@code null} or nothing matches
   */
  private static String joinMatches(Pattern pattern, int group, String html) {
    if (html == null) {
      return null;
    }
    Matcher matcher = pattern.matcher(html);
    if (!matcher.find()) {
      // Callers rely on null to mean "nothing found on this line".
      return null;
    }
    StringBuilder joined = new StringBuilder(matcher.group(group));
    while (matcher.find()) {
      joined.append(SEPARATOR).append(matcher.group(group));
    }
    return joined.toString();
  }
}