近期做的一个项目,客户要求将网站的首页静态化。因为自己从未接触过静态化的相关知识,所以只好硬着头皮到处找资料,弄得焦头烂额。最后想到一种解决方案:用爬虫技术抓取自己网站的首页,然后用抓取下来的整个页面替换掉原来的首页,这样用户访问的就是一个静态资源了。百度了一下,发现果然有这样的案例,于是稍作修改拿来使用,并写此博客记录一下。
package com.evcipa.comutil;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.List;
import java.util.Map;
import java.util.Timer;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.io.output.FileWriterWithEncoding;
import org.apache.log4j.Logger;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Generates static HTML snapshots of dynamic pages.
 *
 * <p>Two strategies are offered:
 * <ul>
 *   <li>{@link #createHtmlPage(String, String)} downloads the raw server response with
 *       Commons HttpClient. No JavaScript is executed.</li>
 *   <li>{@link #getHtmlPage(String, String)} loads the page in an HtmlUnit {@code WebClient}
 *       with JavaScript enabled and saves the resulting DOM.</li>
 * </ul>
 *
 * <p>Company: BK
 *
 * @author ZhangAn
 */
public class HtmlGenerator {

    private static final Logger logger = Logger.getLogger(HtmlGenerator.class);

    /** Character set used for requests, responses and generated files. */
    private static final String ENCODING = "UTF-8";

    /**
     * Prefix that replaces relative {@code ../} path segments in fetched pages.
     * When {@code null}, pages are written without any path rewriting.
     */
    String webappname = null;

    /** Creates a generator that leaves relative paths in fetched pages untouched. */
    public HtmlGenerator() {
        this(null);
    }

    /**
     * Creates a generator that rewrites relative paths against the given prefix.
     *
     * @param webappname prefix substituted for runs of {@code ../}; may be {@code null}
     *                   to disable rewriting
     */
    public HtmlGenerator(String webappname) {
        this.webappname = webappname;
    }

    /**
     * Fetches {@code url} with an HTTP GET and stores the response body as a UTF-8 file.
     *
     * <p>The body is saved as delivered by the server (after relative-path rewriting);
     * scripts are not executed. Use {@link #getHtmlPage(String, String)} for pages whose
     * content is produced by JavaScript.
     *
     * @param url          address of the page to snapshot
     * @param htmlFileName path of the file to create or overwrite
     * @return {@code true} if the server answered 200 and the file was written;
     *         {@code false} otherwise (the failure is logged)
     */
    public boolean createHtmlPage(String url, String htmlFileName) {
        GetMethod getMethod = null;
        BufferedReader reader = null;
        try {
            HttpClient httpClient = new HttpClient();
            // Charset HttpClient uses when decoding content.
            httpClient.getParams().setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET, ENCODING);

            getMethod = new GetMethod(url);
            // Default recovery policy: failed requests are retried automatically (3 attempts).
            getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                    new DefaultHttpMethodRetryHandler());
            getMethod.addRequestHeader("Content-Type", "text/html;charset=UTF-8");

            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode != 200) {
                logger.error("静态页面引擎在解析" + url + "产生静态页面" + htmlFileName + "时出错!");
                return false;
            }

            InputStream body = getMethod.getResponseBodyAsStream();
            if (body == null) {
                throw new IOException("empty response body");
            }
            // Decode explicitly as UTF-8; the platform default charset garbles Chinese text.
            reader = new BufferedReader(new InputStreamReader(body, ENCODING));
            StringBuilder content = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\n');
            }

            writeHtml(htmlFileName, formatPage(content.toString()));
            return true;
        } catch (Exception ex) {
            logger.error("静态页面引擎在解析" + url + "产生静态页面" + htmlFileName + "时出错:"
                    + ex.getMessage(), ex);
            return false;
        } finally {
            closeQuietly(reader);
            // getMethod stays null when constructing it failed (e.g. malformed URL).
            if (getMethod != null) {
                getMethod.releaseConnection();
            }
        }
    }

    /**
     * Writes {@code content} to {@code htmlFileName} as UTF-8, replacing any existing file.
     *
     * @param htmlFileName path of the file to write
     * @param content      text to store
     * @throws IOException if the file cannot be opened, written or closed
     */
    private synchronized void writeHtml(String htmlFileName, String content) throws IOException {
        FileOutputStream out = new FileOutputStream(htmlFileName);
        try {
            BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, ENCODING));
            writer.write(content);
            // Flush explicitly so that a write failure surfaces here and not during close.
            writer.flush();
        } finally {
            out.close();
        }
    }

    /**
     * Replaces every run of {@code ../} segments with {@code webappname + "/"} so that
     * resources referenced by the page resolve from the saved file's location.
     *
     * @param page page source
     * @return the rewritten source, or {@code page} unchanged when no prefix is configured
     */
    private String formatPage(String page) {
        if (webappname == null) {
            return page;
        }
        // quoteReplacement keeps '$' and '\' in the prefix from being read as group references.
        String root = java.util.regex.Matcher.quoteReplacement(webappname + "/");
        return page.replaceAll("(?:\\.\\./)+", root);
    }

    /**
     * Loads {@code url} in an HtmlUnit client with JavaScript and CSS enabled and saves the
     * page's XML serialization to a file.
     *
     * @param url      address of the page to snapshot
     * @param rootPath path of the file to write (passed straight to
     *                 {@link #stringToFile(String, String)})
     * @return {@code true} if the page was loaded and written; {@code false} otherwise
     */
    public static boolean getHtmlPage(String url, String rootPath) {
        WebClient multiWebClient = new WebClient(BrowserVersion.CHROME);
        try {
            multiWebClient.getOptions().setJavaScriptEnabled(true);
            multiWebClient.getOptions().setCssEnabled(true);
            multiWebClient.setAjaxController(new NicelyResynchronizingAjaxController());
            multiWebClient.getOptions().setTimeout(50000); // timeout in milliseconds

            HtmlPage htmlPage = multiWebClient.getPage(url);
            // Give background scripts up to 20 s to finish before serializing the DOM.
            multiWebClient.waitForBackgroundJavaScript(20000);
            return stringToFile(htmlPage.asXml(), rootPath);
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
            return false;
        } finally {
            multiWebClient.closeAllWindows();
        }
    }

    /**
     * Writes {@code html} to a file as UTF-8.
     *
     * @param html     text to store
     * @param rootPath path of the file to write; despite the name it is used as the file
     *                 path itself, not as a directory
     * @return {@code true} on success, {@code false} if an I/O error occurred (logged)
     */
    public static boolean stringToFile(String html, String rootPath) {
        FileWriterWithEncoding fileWriter = null;
        try {
            fileWriter = new FileWriterWithEncoding(rootPath, "utf-8");
            fileWriter.write(html);
            fileWriter.flush();
            return true;
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
            return false;
        } finally {
            closeQuietly(fileWriter);
        }
    }

    /**
     * Performs an HTTP GET and returns the response body decoded as UTF-8.
     *
     * <p>Lines of the response are concatenated without their line terminators.
     *
     * @param url address to request
     * @return the response text; an empty string (or whatever was read before the error)
     *         if the request fails, in which case the error is logged
     */
    public static String sendGet(String url) {
        StringBuilder result = new StringBuilder();
        BufferedReader in = null;
        try {
            java.net.URL connURL = new java.net.URL(url);
            java.net.HttpURLConnection httpConn =
                    (java.net.HttpURLConnection) connURL.openConnection();
            // Common request headers.
            httpConn.setRequestProperty("Accept", "*/*");
            httpConn.setRequestProperty("Connection", "Keep-Alive");
            httpConn.setRequestProperty("User-Agent",
                    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)");
            httpConn.connect();

            in = new BufferedReader(new InputStreamReader(httpConn.getInputStream(), ENCODING));
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
        } finally {
            closeQuietly(in);
        }
        return result.toString();
    }

    /** Closes {@code resource} if it is non-null, logging instead of propagating failures. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            logger.warn(e.getMessage(), e);
        }
    }

    /** Manual smoke test: snapshots the local index page to {@code D:/a.html}. */
    public static void main(String[] args) {
        HtmlGenerator h = new HtmlGenerator("webappname");
        boolean created =
                h.createHtmlPage("http://localhost:8080/evcipa/views/index.jsp", "D:/a.html");
        if (created) {
            System.out.println("静态页面已经生成到D:/a.html");
        } else {
            System.err.println("静态页面生成失败: D:/a.html");
        }
    }
}

后续处理

由于首页已经变成静态页面,当内容发生变化时,它不会像动态页面那样自动更新,所以需要一个定时任务来定期重新生成该静态页面,以同步最新的动态内容。