一、首先,引入依赖jar包
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>
二、编写方法
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/*
-
爬虫
/
public class MainActivity {
/
* Jsoup爬取糗事百科数据
*/
public static List<Map<String,String>> spider(String url,int pages){List<Map<String,String>> list = new ArrayList<Map<String,String>>(); try { Document mozilla = Jsoup.connect(url+"/page/"+pages) .userAgent("Mozilla") .timeout(3000) .get(); Elements select1 = mozilla.select("div.article.block"); for (Element element : select1) { Document parse = Jsoup.parse(element.toString()); Elements select = parse.select("a h2");//作者 Elements select2 = parse.select("a img");//作者头像 Elements select3 = parse.select("a.contentHerf");//内容 Elements select4 = parse.select("div.thumb img");//内容图片 Elements select5 = parse.select("span.stats-vote i");//赞数量 Map<String,String> map = new HashMap<String, String>(); map.put("author", select.text());//作者 map.put("author_img", select2.size()>0?"http:"+select2.attr("src"):"");//作者头像 map.put("content", select3.text());//内容 map.put("content_img", select4.size()>0?"http:"+select4.attr("src"):"");//内容图片 map.put("number", select5.text());//赞数量 list.add(map); } } catch (IOException e) { list = null; } return list; }
}
三、测试
String url = “https://www.qiushibaike.com/8hr/page/”;
List<Map<String, String>> list = MainActivity.spider(url, 1);