java爬虫下载付费html网页模板

原创

公众号bigsai 2022-08-24 14:22:48 ©著作权

©著作权归作者所有：来自51CTO博客作者公众号bigsai的原创作品，请联系作者获取转载授权，否则将追究法律责任

前言

前一段时间我们有一个网页的project小项目，要求学习Bootstrap。然而自己写的模板和别人写好的东西，无论从美观还是手机运行的兼容性上差距都很巨大。中途我们放弃自己写的东西，开始偷别人的模板。有些不会偷的同学甚至还付费下载，都什么年代了，程序员还要花钱买模板。那次结束后，突发奇想能不能写个程序，让它自动下载模板。经过不断努力和解决bug，最终取得了成功。

思路

大致思路为：输入模板的一个页面为url，通过这个链接遍历所有与之有关的链接放到hashset中（采用队列的宽度优先遍历bfs）。这个相关用字符判断链接前面的主要域名地址。（链出去的链接不处理，防止无限扩大）。同时，还要将各种url分类放到不同的set中。

html页面分析：抓取html链接。还要按行读取html文本分析其中可能隐藏的css文件（可能有背景图片）。获取js链接，获取image地址，css地址，（注意一定要储存绝对地址而不是相对地址）。还有的涉及到上层目录。需要处理。

css页面：按行分析。因为css中可能储存背景图片以及其他logo。
js：直接下载保存。
html：下载保存
image：下载保存

注意点：

所有下载链接或者其他活动都要在try catch进行，在catch中跳过这个步骤，执行相应步骤。
下载目录在download自行更改（默认F：//download）
添加jsoup的jar包
有些图片藏在js文件中和css文件中，所以需要去分析js文件和css文件，我这个只分析了css，没分析js。
由于精力和时间问题，项目并没有完善；由于笔者此时正则能力不足，大部分采用字符串分割查找或者contains查找，难免有疏漏。
目前代码测试只针对17素材之家部分模板测试有效。其他站点未进行测试
只是小白，代码冗长低水平，大佬勿喷。
附上代码如下：

代码

启动主类getmoban

import java.io.IOException;
import java.util.Iterator;
import java.util.Scanner;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;


/**
 * Command-line entry point of the template downloader.
 *
 * <p>Reads a start URL from standard input, lets {@code geturl} crawl it, and
 * then queues every collected HTML page, script, stylesheet and image on a
 * fixed pool of six worker threads. Stylesheets are additionally handed to
 * {@code csssearch} so that resources referenced via {@code url(...)} are
 * fetched as well.
 */
public class getmoban {

  /**
   * Runs the crawl and schedules all downloads.
   *
   * @param args unused
   * @throws IOException declared because {@code geturl.judel()} declares it
   */
  public static void main(String[] args) throws IOException
  {
    ExecutorService ex=Executors.newFixedThreadPool(6);
    try(Scanner sc=new Scanner(System.in))
    {
      System.out.println("请输入网址（别太大否则下载不完）");
      String url=sc.nextLine().trim();
      geturl g=new geturl(url);
      System.out.println(g.file);
      g.judel();
      // Same order as before: pages, scripts, stylesheets (also scanned), images.
      submitAll(ex,g.htmlurlset.iterator(),false);
      submitAll(ex,g.jsset.iterator(),false);
      submitAll(ex,g.cssset.iterator(),true);
      submitAll(ex,g.imgset.iterator(),false);
    }
    finally
    {
      // Always release the pool; already-submitted downloads still run to completion.
      ex.shutdown();
    }
  }

  /**
   * Queues one download task per URL and optionally scans each URL as a stylesheet.
   *
   * <p>A failure for one URL is reported on standard error and does not stop
   * the remaining URLs from being processed.
   *
   * @param pool    executor that runs the download tasks
   * @param urls    URLs to fetch; elements are converted with {@code String.valueOf}
   *                so the method works whether or not the source set is generic
   * @param scanCss when true, each URL is also passed to {@code csssearch.searchimage}
   */
  private static void submitAll(ExecutorService pool,Iterator<?> urls,boolean scanCss)
  {
    while(urls.hasNext())
    {
      String name=String.valueOf(urls.next());
      try
      {
        pool.execute(new download(name));
        if(scanCss)
        {
          csssearch.searchimage(name);
        }
      }
      catch(IOException|RuntimeException e)
      {
        System.err.println("处理失败: "+name+" ("+e+")");
      }
    }
  }
}

分析链接geturl

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Queue;
import java.util.Set;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class geturl {
  
  public static String url="http://www.17sucai.com/preview/1/2014-11-28/jQuery用户注册表单验证代码/index.html";
  static String head="http";
  public geturl(String url)
  {
    this.url=url;
  }
  static String file=url;//文件路径
  {
    if(url.contains("http"))
    {
      head=file.split("//")[0];
      file=file.split("//")[1];
    }
    int last=file.lastIndexOf("/");
    file=file.substring(0, last);
  }
  static Set htmlurlset=new HashSet();//html
  static Set jsset=new HashSet();//js
  static Set imgset=new HashSet();//image
  static Set cssset=new HashSet();//css样式
  static Queue queue=new ArrayDeque();
  
//  public geturl() throws IOException 
//  {this.judel();}
  public static void judel() throws IOException 
  {
    queue.add(url);htmlurlset.add(url);
    while(!queue.isEmpty()&&queue!=null)//要防止链接无限扩大
    {
      String teamurl=queue.poll();//弹出头并且删除节点
      System.out.println(teamurl);
      
      if(!teamurl.endsWith(".com"))//有的网站短小，可能识别有错误  
      {
      if(file.indexOf("/")>0)
      {if(teamurl.contains(file.substring(0,file.indexOf("/"))))
      analyze(teamurl);}
      else
        analyze(teamurl);
      }
//      catch(Exception e) {System.out.println("cuo");}     
    }
    
  }
      
  public static void analyze(String URL)
  {
    try {
   Document doc;
    doc = Jsoup.connect(URL).timeout(20000).header("user-agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36").ignoreContentType(true).get();
     Elements all=doc.select("[class]");//检查
     Elements js=doc.getElementsByTag("script");
     Elements html=doc.select("a[href]");
     Elements img=doc.select("img");
     Elements css=doc.select("link[href]");
     for(Element e:all)
     {
       if(e.attr("style")!="")//找到藏在html的css的图片背景
       { 
         String tex=e.attr("style");
         if(tex.contains("url"))
         {
           String urladress=file;
          String imgurl=tex.split("url")[1];
          imgurl=imgurl.split("\\(")[1].split("\\)")[0];//转义字符串
          if(imgurl.startsWith("'")||imgurl.startsWith("\""))//注意转义字符串
          {
            imgurl=imgurl.substring(1,imgurl.length()-1);
          } 
          while(imgurl.startsWith(".."))
          {
            imgurl=imgurl.substring(imgurl.indexOf("/") 1);          
            urladress=urladress.substring(0,urladress.lastIndexOf("/"));
          }
          urladress=head "//" urladress "/" imgurl;
          imgset.add(urladress);
         }         
       }
     }
     for(Element htmlelement:html)
     {        
       String a=htmlelement.absUrl("href").split("#")[0];
       
       if(!a.equals(""))
       {
         if(!htmlurlset.contains(a)&&a.contains(file.substring(0,file.indexOf("/"))))//不存在继续遍历
         { queue.add(a);htmlurlset.add(a); //System.out.println(a);
         }       
       }         
     }
     for(Element jselement:js)//判断JS
     {
       String team=jselement.absUrl("src");  
       if(!team.equals(""))
       jsset.add(team);//添加

     }
     for(Element csselement:css)
     {
       String team=csselement.absUrl("href");
       if(!team.equals(""))//绝对路径
       cssset.add(team);      
       // System.out.println(e.attr("href"));
     }
     for(Element imageelement:img)
     {
       String team=imageelement.absUrl("src");
       if(!team.equals(""))//绝对路径
       imgset.add(team);
       
       //System.out.println(e.attr("href"));
     }
    }
    catch(Exception e)
    {
      if(!queue.isEmpty()) {
      URL=queue.poll();
       analyze(URL);}
    }
  }         
  }

分析css（css可能隐藏图片）csssearch

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

public class csssearch {

  public static void searchimage(String ur) throws IOException {
    if(ur.toLowerCase().contains("bootstarp")) {return;}//bootstarp.css过滤掉，肯定没图片
    Set imgset=new HashSet();
    //String ur="http://demo.cssmoban.com/cssthemes5/cpts_1019_bpi/css/style.css";
    String http="http";
    String fileurl=ur;
    if(fileurl.startsWith("http"))
    {
      http=fileurl.split("//")[0];//防止https协议
      fileurl=fileurl.split("//")[1];
    }
    fileurl=fileurl.substring(0,fileurl.lastIndexOf("/"));
    //System.out.println(fileurl);//测试
    URL url=new URL(ur);
     URLConnection conn = url.openConnection();
   conn.setConnectTimeout(1000);
   conn.setReadTimeout(5000);
   conn.connect();
   InputStream in= conn.getInputStream();
   InputStreamReader inp=new InputStreamReader(in);
   BufferedReader buf=new BufferedReader(inp);
   File file=new File("F:\\download\\" ur.split("//")[1]);
     if(!file.exists())
     {
       file.getParentFile().mkdirs();
       file.createNewFile();
     }
    // BufferedOutputStream bufout=new BufferedOutputStream(new FileOutputStream(file));
     String tex="";
     while((tex=buf.readLine())!=null)
     {
//      System.out.println(tex);
      if(tex.contains("url"))
      {
        String urladress=fileurl;
        String imgurl=tex.split("url")[1];
        imgurl=imgurl.split("\\(")[1].split("\\)")[0];//转义字符串
        if(imgurl.startsWith("'")||imgurl.startsWith("\""))//注意转义字符串
        {
          imgurl=imgurl.substring(1,imgurl.length()-1);
        }
        //System.out.println(imgurl);//测试
        while(imgurl.startsWith(".."))
        {
          imgurl=imgurl.substring(imgurl.indexOf("/") 1);          
          urladress=urladress.substring(0,urladress.lastIndexOf("/"));
        }
        urladress=http "//" urladress "/" imgurl;
        //System.out.println(urladress);
        //down.download(urladress);
        imgset.add(urladress);
      }
     }
  //   bufout.close();
     buf.close();
     inp.close();
     in.close();
     Iterator it=imgset.iterator();
     while(it.hasNext())
     {     
      String team=it.next();
      
      try {
        download down=new download(team);
        Thread t1=new Thread(down);
        t1.start();System.out.println(team "下载成功");}
      catch(Exception e) {System.out.println("下载失败：" team);}
     }
     
  }
}

download(线程池下载)

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;

/**
 * Runnable task that saves one URL below the local download directory,
 * mirroring the URL's host and path as the folder structure.
 */
public class download implements Runnable{

  /** Root folder for all saved files; change this to relocate the output. */
  private static final String DOWNLOAD_DIR="F:\\download\\";

  /** URL this task fetches when run. */
  public String ur;
  public download() {}
  public download(String ur)
  {
    this.ur=ur;
  }

  /**
   * Downloads {@code ur} to {@code DOWNLOAD_DIR} + host + path.
   *
   * <p>The connection is opened before the local file is created, so a URL
   * that cannot be fetched leaves no file behind.
   *
   * @param ur absolute URL to fetch
   * @throws IOException if the URL cannot be read, the local file cannot be
   *                     written, or the URL path contains a {@code ..} segment
   */
  public static void download(String ur) throws IOException
  {
    File file=new File(DOWNLOAD_DIR+localPath(ur));
    URLConnection conn=new URL(ur).openConnection();
    conn.setConnectTimeout(4000);
    conn.setReadTimeout(5000);
    conn.connect();
    // try-with-resources closes both streams even when the transfer fails midway.
    try(InputStream in=new BufferedInputStream(conn.getInputStream()))
    {
      File parent=file.getParentFile();
      if(parent!=null)
      {
        parent.mkdirs();
      }
      try(BufferedOutputStream bufout=new BufferedOutputStream(new FileOutputStream(file)))
      {
        byte[] b=new byte[8192];
        int n;
        while((n=in.read(b))!=-1)
        {
          bufout.write(b,0,n);
        }
      }
    }
  }

  /**
   * Maps a URL to a path relative to the download directory: drops the
   * scheme, the query string and the fragment, and names directory URLs
   * {@code index.html}.
   */
  private static String localPath(String ur) throws IOException
  {
    String path=ur;
    int sep=path.indexOf("//");
    if(path.startsWith("http")&&sep>=0)
    {
      // Cut at the first "//" only, so a later "//" in the path is kept.
      path=path.substring(sep+2);
    }
    int cut=path.indexOf('?');
    if(cut>=0)
    {
      path=path.substring(0,cut);
    }
    cut=path.indexOf('#');
    if(cut>=0)
    {
      path=path.substring(0,cut);
    }
    for(String segment:path.split("/"))
    {
      // URLs come from crawled pages; never let them climb out of the download folder.
      if(segment.equals(".."))
      {
        throw new IOException("拒绝可疑路径: "+ur);
      }
    }
    if(path.endsWith("/"))
    {
      path=path+"index.html";
    }
    return path;
  }

  /** Downloads {@link #ur} and reports the outcome on the console. */
  @Override
  public void run() {
    try {
      download(ur);
      System.out.println(Thread.currentThread().getName()+" 下载"+ur+"成功");
    } catch (IOException e) {
      System.err.println(Thread.currentThread().getName()+" 下载"+ur+"失败: "+e);
    }
  }

}