// 读取html页面文件解析邮箱地址



package com.alpha.test;import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;/**
* 读取html页面文件解析邮箱地址
*
* @author JavaAlpha 2012-12-19 13:45:11
*/
public class ReadHtmlToTxt {

    /**
     * How many characters are inspected before the first '@', and how many
     * starting at the '@' itself (so at most 9 characters after it).
     */
    private static final int EMAIL_WINDOW = 10;

    /** Only text ending in this suffix is treated as an address. */
    private static final String DOMAIN_SUFFIX = ".com";

    /** Matches one character in the range U+4E00..U+9FA5; compiled once and reused. */
    private static final Pattern CHINESE_CHAR = Pattern.compile("[\\u4e00-\\u9fa5]");

    /**
     * Reads a file in 4096-character chunks and collects what
     * {@link #checkEmail(String)} extracts from each chunk.
     *
     * <p>The extracted pieces are concatenated without any separator. If the
     * path does not denote an existing, readable regular file, or an I/O
     * error occurs, the stack trace is printed and whatever was collected so
     * far is returned.
     *
     * <p>NOTE(review): because {@code checkEmail} only looks at the first '@'
     * of its argument, at most one address is extracted per chunk, and an
     * address that straddles a chunk boundary is not recognised. This
     * behaviour is kept unchanged so existing output stays the same.
     * The file is decoded with the platform default charset.
     *
     * @param path path of the file to scan
     * @return the concatenated extraction results, possibly empty
     */
    public static String readHtml(String path) {
        StringBuilder emailCont = new StringBuilder();
        File htmlFile = new File(path);
        if (!(htmlFile.exists() && htmlFile.isFile() && htmlFile.canRead())) {
            return emailCont.toString();
        }

        try {
            Reader in = new FileReader(htmlFile);
            try {
                char[] buff = new char[4096];
                int nch;
                while ((nch = in.read(buff, 0, buff.length)) != -1) {
                    emailCont.append(checkEmail(new String(buff, 0, nch)));
                }
            } finally {
                // Always release the file handle, even when read() fails.
                in.close();
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException, which is an IOException.
            e.printStackTrace();
        }
        return emailCont.toString();
    }

    /**
     * Extracts an address-like fragment around the first '@' in {@code str}.
     *
     * <p>The method takes up to {@value #EMAIL_WINDOW} characters before the
     * '@' and up to {@value #EMAIL_WINDOW} characters starting at the '@',
     * removes a few punctuation characters, and cuts the result right after
     * the first ".com" that follows the '@'. A non-empty result is also
     * printed to standard output.
     *
     * <p>This is a heuristic, not a validating parser: characters in front of
     * the local part that are not stripped below remain in the result.
     *
     * @param str text to inspect; {@code null} is treated like text without '@'
     * @return the extracted fragment, or an empty string when {@code str} has
     *         no '@' or no ".com" follows the '@' inside the inspected window
     */
    public static String checkEmail(String str) {
        if (str == null) {
            return "";
        }
        int at = str.indexOf('@');
        if (at < 0) {
            return "";
        }

        // Clamp the window to the string bounds; an '@' close to either end
        // of the input would otherwise make substring() throw.
        int from = Math.max(0, at - EMAIL_WINDOW);
        int to = Math.min(str.length(), at + EMAIL_WINDOW);
        String postCont = str.substring(from, to);

        // '/' is only dropped together with angle brackets, i.e. when the
        // window contains tag residue.
        if (postCont.indexOf('<') > -1 || postCont.indexOf('>') > -1) {
            postCont = postCont.replace(">", "").replace("<", "").replace("/", "");
        }

        // Drop ASCII commas and the ideographic full stop (U+3002).
        // NOTE(review): the earlier version tested for ',' twice and for ';'
        // without ever removing ';' - presumably a full-width comma was meant
        // by one of the tests; confirm before extending this list.
        postCont = postCont.replace(",", "").replace("\u3002", "");

        // Search for the suffix from the '@' onwards so that a ".com" in the
        // text preceding the address cannot truncate the result early.
        int suffixAt = postCont.indexOf(DOMAIN_SUFFIX, postCont.indexOf('@'));
        if (suffixAt < 0) {
            return "";
        }

        postCont = postCont.substring(0, suffixAt + DOMAIN_SUFFIX.length());
        System.out.println(postCont);
        return postCont;
    }

    /**
     * Tells whether {@code str} contains at least one character in the range
     * U+4E00..U+9FA5.
     *
     * @param str text to test; {@code null} yields {@code false}
     * @return {@code true} if such a character is present
     */
    public static boolean checkChinese(String str) {
        return str != null && CHINESE_CHAR.matcher(str).find();
    }

    /**
     * Writes {@code cont} to the file at {@code path}, replacing any previous
     * content. The file is created if it does not exist yet.
     *
     * <p>I/O failures are not propagated; their stack trace is printed.
     * The text is encoded with the platform default charset.
     *
     * @param cont text to write; {@code null} leaves the file empty
     * @param path destination file path
     */
    public static void writerFile(String cont, String path) {
        File emailFile = new File(path);
        try {
            // FileWriter creates a missing file itself, so no separate
            // createNewFile() call is needed.
            Writer out = new FileWriter(emailFile);
            try {
                out.write(cont == null ? "" : cont);
            } finally {
                // close() flushes, and runs even if write() fails.
                out.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the resource at {@code strUrl} line by line and prints it to
     * standard output, with the literal text {@code </br>} appended after
     * every line.
     *
     * <p>On a malformed URL or an I/O error the stack trace is printed and
     * whatever was read so far is still printed afterwards. The response is
     * decoded with the platform default charset.
     *
     * @param strUrl URL to fetch
     */
    public static void readUrlCont(String strUrl) {
        StringBuilder cont = new StringBuilder();

        try {
            URL url = new URL(strUrl);
            URLConnection conn = url.openConnection();
            BufferedReader reader =
                    new BufferedReader(new InputStreamReader(conn.getInputStream()));
            try {
                String lineCont;
                while ((lineCont = reader.readLine()) != null) {
                    cont.append(lineCont).append("</br>");
                }
            } finally {
                // Release the connection's stream even when readLine() fails.
                reader.close();
            }
        } catch (IOException e) {
            // Also covers MalformedURLException, which is an IOException.
            e.printStackTrace();
        }

        System.out.println(cont.toString());
    }

    /**
     * Demo entry point: fetches a fixed web page and prints it.
     *
     * @param args unused
     */
    public static void main(String[] args) {

        //String cont = readHtml("e://test.htm"); // read a file

        //writerFile(cont, "e://test.txt"); // write a file

        //checkChinese("qwe123");

        readUrlCont("http://www.163.com");
    }
}