java 获取动态页面html html动态获取数据

转载

墨色天香 2023-07-19 08:55:24

文章标签 java 获取动态页面html java System html 文章分类 Java 后端开发

1.HtmlUnit是一个用java编写的无界面浏览器，它对html文档进行建模，可以通过API访问页面、填充表单、点击链接等，如同正常浏览器一样操作。典型应用于测试以及从网页抓取信息。HtmlUnit兼具HttpClient和jsoup两者的功能，但速度比较慢；它默认开启css和js的解析，如果关闭这两项功能，速度会有所提升。

2.这里选用HtmlUnit来爬取数据，主要是为了获取页面的js和css。

3.主要代码如下

package com.los;

import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomElement;
import com.gargoylesoftware.htmlunit.html.DomNodeList;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.los.util.DownlandPic;

import java.io.IOException;
import java.util.regex.Pattern;

/**
 * Demo crawler built on HtmlUnit: loads a page, prints its rendered markup and
 * its {@code <script>} elements, then downloads the lazily-loaded images.
 */
public class HtmlUnitTest {

    /** Site root that is prepended to the relative image paths found on the page. */
    private static final String BASE_URL = "http://www.bdqn.cn";

    /** Local directory prefix; the image's relative path is appended to it verbatim. */
    private static final String SAVE_DIR = "L:\\Spider\\";

    /** Attribute in which the page's lazy-loading markup keeps the real image address. */
    private static final String LAZY_SRC_ATTRIBUTE = "data-original";

    /**
     * Fetches the page, waits for background JavaScript, dumps the page and its
     * script elements to stdout and finally downloads the images.
     *
     * @param args unused
     * @throws IOException if the page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        //http://www.bdqn.cn/ https://www.baidu.com/?tn=78000241_22_hao_pg
        String url = "http://www.bdqn.cn/";
        // try-with-resources closes the client (and frees its windows) even when getPage throws.
        try (WebClient webClient = new WebClient()) {
            HtmlPage page = webClient.getPage(url);
            // Give background JavaScript up to 10 seconds to finish.
            webClient.waitForBackgroundJavaScript(10000);
            System.out.println(page.asXml());

            for (DomElement script : page.getElementsByTagName("script")) {
                System.out.println("获取的内容"+script);
            }
            // Process the images contained in the page.
            operPic(page);
        }
    }

    /**
     * Downloads every image of the page that carries a non-empty
     * {@code data-original} attribute.
     *
     * <p>The attribute value is treated as a site-relative path: it is appended
     * to the site root to form the download URL and to the local save directory
     * to form the destination file, so the saved files mirror the paths used in
     * the page. Images without the attribute (e.g. plain background images) are
     * only logged.
     *
     * @param page the loaded page whose {@code <img>} elements are inspected
     */
    public static void operPic(HtmlPage page){
        for (DomElement image : page.getElementsByTagName("img")) {
            // Read the attribute through the DOM API instead of slicing the element's
            // toString() text by quote positions; HtmlUnit yields an empty string when
            // the attribute is absent.
            String relativePath = image.getAttribute(LAZY_SRC_ATTRIBUTE);
            boolean matches = !relativePath.isEmpty();
            System.out.println("匹配结果-------->"+matches);
            // Example match: <img class="lazy" data-original="/img/newImg/qn_pro4.jpg"...>
            if (matches) {
                System.out.println("截取的内容："+relativePath);
                // The page only holds a relative path, so complete it with the site root.
                DownlandPic.downloadPicture(BASE_URL+relativePath, SAVE_DIR+relativePath);
            }
            System.out.println("图片路径"+image);
        }
    }
}

图片下载和正则匹配

package com.los.util;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Helpers for the crawler demo: saving a remote resource to disk and locating
 * double-quote characters inside the text of a tag.
 */
public class DownlandPic {

    /** Size of the buffer used while streaming a download to disk. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Downloads the resource at {@code urlList} and stores it in the file {@code path}.
     *
     * <p>Missing parent directories of {@code path} are created. The data is
     * streamed straight to the file, and both streams are closed even when the
     * transfer fails. Failures are reported on stderr and not propagated, so a
     * single broken image does not abort the caller's loop.
     *
     * @param urlList address of the resource to download
     * @param path    destination file on the local file system
     */
    public static void downloadPicture(String urlList,String path){
        try {
            URL url = new URL(urlList);
            File target = new File(path);
            try (InputStream in = url.openStream()) {
                // Only touch the file system once the remote side has answered.
                File parent = target.getParentFile();
                if (parent != null && !parent.mkdirs() && !parent.isDirectory()) {
                    throw new IOException("Cannot create directory " + parent);
                }
                try (OutputStream out = new FileOutputStream(target)) {
                    byte[] buffer = new byte[BUFFER_SIZE];
                    int length;
                    // read() signals end of stream with -1.
                    while ((length = in.read(buffer)) != -1) {
                        out.write(buffer, 0, length);
                    }
                }
            }
        } catch (IOException e) {
            // Covers MalformedURLException as well (it is an IOException).
            e.printStackTrace();
        }
    }

    /**
     * Returns the index of the second double-quote character in {@code string}.
     *
     * <p>NOTE(review): the method name suggests the third quote, but the
     * implementation has always used the second one; confirm the intent with
     * the callers before changing it.
     *
     * @param string text to scan
     * @return zero-based index of the second {@code "}
     * @throws IllegalStateException if {@code string} contains fewer than two quotes
     */
    public static int getCharacterPosition3(String string){
        return indexOfNthQuote(string, 2);
    }

    /**
     * Returns the index of the third double-quote character in {@code string}.
     *
     * <p>NOTE(review): the method name suggests the fourth quote, but the
     * implementation has always used the third one; confirm the intent with
     * the callers before changing it.
     *
     * @param string text to scan
     * @return zero-based index of the third {@code "}
     * @throws IllegalStateException if {@code string} contains fewer than three quotes
     */
    public static int getCharacterPosition4(String string){
        return indexOfNthQuote(string, 3);
    }

    /** Finds the zero-based index of the {@code n}-th (1-based) double quote in {@code string}. */
    private static int indexOfNthQuote(String string, int n) {
        int index = -1;
        for (int found = 0; found < n; found++) {
            index = string.indexOf('"', index + 1);
            if (index < 0) {
                throw new IllegalStateException(
                        "Expected at least " + n + " double quotes but found " + found + " in: " + string);
            }
        }
        return index;
    }

    /**
     * Small manual check: prints both quote positions for a sample tag and the
     * text that lies between them.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String content = "<img class=\"lazy\" data-original=\"/img/newImg/news_img2.jpg\" src=\"/img/newImg/news2.jpg\" style=\"display: block;\"/>";
        System.out.println(getCharacterPosition3(content));
        System.out.println(getCharacterPosition4(content));
        String substring = content.substring(getCharacterPosition3(content), getCharacterPosition4(content));
        System.out.println(substring);

    }
}

4.因为这里网页中的图片地址为相对路径，所以下载时先在页面上找到站点的绝对地址，再与相对路径拼接成完整的下载地址。图片下载后的保存路径必须与保存到本地的页面中图片标签里的地址保持一致，否则页面会找不到图片。

　　5.在匹配 " 时之所以取第2个和第3个，是根据这里爬取到的数据来写的；如果页面中标签的结构不同，需要相应调整。

java 获取动态页面html html动态获取数据_System