java使用jsoup，多线程批量爬取天极网某分类下的图片

原创

JavaPub 2021-06-21 16:15:07 ©著作权

文章标签 java学习 文章分类 Java 后端开发

©著作权归作者所有：来自51CTO博客作者JavaPub的原创作品，请联系作者获取转载授权，否则将追究法律责任

小Demo转自csdn某作者，

本例子只作为测试，页码上限直接写死为100，实际页数可能更少或更多，因此容易报错。更好的方式是获取“下一页”按钮的地址再去访问，当取不到“下一页”的内容时跳出循环。

多线程只体现在图片文件的下载保存上；也可以在遍历 elements 的循环中再使用多线程去访问各个页面。

本案例需要jsoup包的支持，可到下方url下载

jsoup jar包

Test.java==============>主方法

package com.test.main;
 
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
 
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
 
public class Test {
    public static void main(String[] args) {
        ExecutorService executor = Executors.newFixedThreadPool(5);
        Document doc = null;
        FileWriter writer = null;
        String rui="index";
        List<String> alist = new  ArrayList<String>();
        //int keyword = 4;
        for(int keyword=4;keyword<100;keyword++){
        try {
            //创建页面对象
            doc = Jsoup.connect("http://pic.yesky.com/c/6_20491_"+keyword+".shtml").userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36").timeout(10000).get();
            //根据标签和class id获取元素
            Elements div = doc.select("div.lb_box");
            //根据标签获取元素
            Elements dl = div.select("dl");
            Elements dd = div.select("dd");
            Elements pages = dd.select("a");
            for(Element e : pages){
                System.out.println(e.text());
                System.out.println(e.attr("href"));
                Document imgdoc = Jsoup.connect(e.attr("href")).userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36").timeout(10000).get();    
                Elements scroll = imgdoc.select("div.effect_scroll");
                Elements li = scroll.select("li");
                Elements urls = li.select("a");
                int i=0;
                for(Element ipage : urls){
                    Document imgpage = Jsoup.connect(ipage.attr("href")).userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36").timeout(10000).get();    
                    Elements imgediv = imgpage.select("div.l_effect_img_mid");
                    Element img = imgediv.select("img").first();
                    FileOutUtils fo =new FileOutUtils(img, e.text());
                    fo.start();
                    System.out.println(i);
                    i++;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        }
    }
    /**
     * 单线程下载
     * @author wangshiyu 
     * @param e
     * @param filepath
     */
    public static void savefile(Element e,String filepath){
        String src=e.attr("src");//获取img中的src路径
        // System.out.println(src);
        //获取后缀名
        String imageName = src.substring(src.lastIndexOf("/") + 1,src.length());
        //连接url
        
        URL url;
        System.out.println(src);
        try {
            url = new URL(src);
            URLConnection uri=url.openConnection();
            //获取数据流
            InputStream is=uri.getInputStream();
            //写入数据流
            File file = new File("E://imgs//"+filepath);
            if(!file.exists()){
                file.mkdirs();
            }
            OutputStream os = new FileOutputStream(new File("E://imgs//"+filepath+"//", imageName)); 
 
            byte[] buf = new byte[1024]; 
 
            int l=0; 
 
            while((l=is.read(buf))!=-1){
                os.write(buf, 0, l);
            } 
        } catch (MalformedURLException e1) {
            e1.printStackTrace();
        } catch (FileNotFoundException e1) {
            e1.printStackTrace();
        } catch (IOException e1) {
            e1.printStackTrace();
        }
    }
}

FileOutUtils.java ==============>多线程保存到本地

package com.test.main;
 
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
 
import org.jsoup.nodes.Element;
 
public class FileOutUtils extends Thread {
    private Element e;
    private String filepath;
    
    
    
    public FileOutUtils(Element e, String filepath) {
        this.e = e;
        this.filepath = filepath;
    }
    /**
     * 多线程下载
     * @author wangshiyu
     * @param e
     * @param filepath
     */
    public void savefile(Element e,String filepath){
        String src=e.attr("src");//获取img中的src路径
        // System.out.println(src);
        //获取后缀名
        String imageName = src.substring(src.lastIndexOf("/") + 1,src.length());
        //连接url
        URL url;
        try {
            url = new URL(src);
            URLConnection uri=url.openConnection();
            //获取数据流
            InputStream is=uri.getInputStream();
            //写入数据流
            File file = new File("E://imgs//"+filepath);
            if(!file.exists()){
                file.mkdirs();
            }
            OutputStream os = new FileOutputStream(new File("E://imgs//"+filepath+"//", imageName)); 
            byte[] buf = new byte[1024]; 
            int l=0; 
            while((l=is.read(buf))!=-1){
                os.write(buf, 0, l);
            } 
 
        } catch (MalformedURLException e1) {
            e1.printStackTrace();
        } catch (FileNotFoundException e1) {
            e1.printStackTrace();
        } catch (IOException e1) {
            e1.printStackTrace();
        }
    }
    public void run() {
        this.savefile(this.e,this.filepath);
    }
}