今天直播如何做爬虫,爬取人家网站的信息,这个小说我感觉不错,所以想下载下来,但是没有直接下载的地址,所以就把他网站给爬下来了,如有冒犯,请站长联系俺。
目标网站
网站基本信息
网站是nginx服务器
编码是 utf-8的
目标的爬取dom
可以看到章节的页面代码是 id=list
节点
解析主页
package day111_24;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.yellowcong.http.common.Constants;
import com.yellowcong.http.utils.HttpClient;
/**
 * Author: yellowcong
 * Date: 2017/11/24
 * Time: 8:47:19
 * Description: Downloads the novel's table-of-contents page and prints each
 * chapter link (href) together with its title text.
 */
public class Demo {
    public static void main(String[] args) {
        // https://www.yite.cc/book/dxjdntbb/
        // Fetch the home page HTML
        String htmlStr = HttpClient.get(Constants.HOME_URL);
        if (htmlStr == null) {
            // HttpClient.get returns null on any request failure — bail out
            // early instead of throwing an NPE in Jsoup.parse below.
            System.err.println("Failed to fetch " + Constants.HOME_URL);
            return;
        }
        // Parse the raw HTML into a DOM document
        Document homeDoc = Jsoup.parse(htmlStr);
        // The chapter list lives in the element with id="list"
        Element element = homeDoc.getElementById("list");
        if (element == null) {
            // Guard against a layout change on the target site.
            System.err.println("No element with id=\"list\" found on the page");
            return;
        }
        // Every <a> tag inside the list is one chapter entry
        Elements pages = element.getElementsByTag("a");
        System.out.println(pages.size());
        for (int i = 0; i < pages.size(); i++) {
            Element node = pages.get(i);
            String hrefStr = node.attr("href");
            String hrefContent = node.html();
            System.out.printf("%s-->%s\r\n", hrefStr, hrefContent);
        }
    }
}
运行结果
代码运行的头和尾,我发现就297章和网站上的298章节对不上,后来发现,原来是网站少了一章。
结尾
界面上的头和尾
结尾
解析单独界面
查看对应的dom,可以看到id是 content
解析代码
package day111_24;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import com.yellowcong.http.common.Constants;
import com.yellowcong.http.utils.HttpClient;
/**
 * Author: yellowcong
 * Date: 2017/11/24
 * Time: 9:40:17
 * Description: Downloads a single chapter page and prints the chapter body,
 * which the site renders inside the element with id="content".
 */
public class Demo2 {
    public static void main(String[] args) {
        String url = "https://www.yite.cc/book/dxjdntbb/12640.html";
        // Fetch the chapter page HTML
        String htmlStr = HttpClient.get(url);
        if (htmlStr == null) {
            // HttpClient.get returns null on any request failure — avoid the
            // NPE that Jsoup.parse(null) would otherwise throw.
            System.err.println("Failed to fetch " + url);
            return;
        }
        // Parse the raw HTML into a DOM document
        Document homeDoc = Jsoup.parse(htmlStr);
        Element content = homeDoc.getElementById("content");
        if (content == null) {
            // Guard against a layout change on the target site.
            System.err.println("No element with id=\"content\" found on the page");
            return;
        }
        String str = content.text();
        System.out.println(str);
    }
}
运行代码
到此还没有结束,还需要设计多线程
目录结构
工具类 HttpClient
package com.yellowcong.http.utils;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.http.HeaderIterator;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultConnectionKeepAliveStrategy;
import org.apache.http.impl.client.DefaultRedirectStrategy;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
/**
 * Thin wrapper around Apache HttpComponents 4.x that shares a single
 * {@link CloseableHttpClient}, cookie store and request configuration across
 * all calls. Exposes simple {@code get}/{@code post} helpers that return the
 * response body as a String, or {@code null} on failure.
 */
public class HttpClient {
    private static final Logger LOG = LogManager.getLogger(HttpClient.class);
    /** Charset used to decode response bodies and encode POST forms. */
    private static final String DEFAULT_ENCODING = "UTF-8";
    public static CloseableHttpClient httpClient = null;
    public static HttpClientContext context = null;
    public static CookieStore cookieStore = null;
    public static RequestConfig requestConfig = null;
    static {
        init();
    }
    /** Builds the shared client, context, cookie store and timeouts once. */
    private static void init() {
        context = HttpClientContext.create();
        cookieStore = new BasicCookieStore();
        // Timeouts: connect 120s, socket read 60s, connection-pool lease 60s.
        requestConfig = RequestConfig.custom().setConnectTimeout(120000).setSocketTimeout(60000)
                .setConnectionRequestTimeout(60000).build();
        // Follow redirects by default and persist cookies in the shared store.
        httpClient = HttpClientBuilder.create().setKeepAliveStrategy(new DefaultConnectionKeepAliveStrategy())
                .setRedirectStrategy(new DefaultRedirectStrategy()).setDefaultRequestConfig(requestConfig)
                .setDefaultCookieStore(cookieStore).build();
    }
    /**
     * Sends a GET request.
     *
     * @param url target URL
     * @return the response body as a String, or {@code null} on failure or redirect status
     */
    public static String get(String url) {
        HttpGet httpget = new HttpGet(url);
        CloseableHttpResponse response = null;
        try {
            // Masquerade as a desktop browser so the site does not reject us with 403.
            httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0");
            // Execute the request in the shared context (keeps cookies)
            response = httpClient.execute(httpget, context);
            return copyResponse2Str(response);
        } catch (Exception e) {
            LOG.debug("请求失败\t" + url, e);
        } finally {
            try {
                if (response != null) {
                    response.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return null;
    }
    /**
     * Converts a response into its body String.
     *
     * @param response the HTTP response to read
     * @return the body decoded with {@link #DEFAULT_ENCODING}, or {@code null}
     *         for 3xx redirect statuses or on error
     */
    private static String copyResponse2Str(CloseableHttpResponse response) {
        try {
            int code = response.getStatusLine().getStatusCode();
            // Redirect statuses carry no useful body here — report "no content".
            if ((code == HttpStatus.SC_MOVED_TEMPORARILY)
                    || (code == HttpStatus.SC_MOVED_PERMANENTLY)
                    || (code == HttpStatus.SC_SEE_OTHER)
                    || (code == HttpStatus.SC_TEMPORARY_REDIRECT)) {
                return null;
            } else {
                return copyInputStream2Str(response.getEntity().getContent());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
    /**
     * Reads an InputStream fully into a String.
     *
     * Note: line terminators are dropped by readLine(), so the result is the
     * concatenation of all lines without newlines.
     *
     * @param in stream to read; closed by this method
     * @return the decoded content, or {@code null} on error
     */
    private static String copyInputStream2Str(InputStream in) {
        // try-with-resources guarantees the reader (and underlying stream) is closed.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, DEFAULT_ENCODING))) {
            String line = null;
            StringBuilder sb = new StringBuilder();
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
            return sb.toString();
        } catch (Exception e) {
            LOG.debug("获取字符串失败", e);
        }
        return null;
    }
    /**
     * Sends a POST request without parameters.
     *
     * @param url target URL
     * @return the response body, or {@code null} on failure
     */
    public static String post(String url) {
        return post(url, null);
    }
    /**
     * Sends a POST request with form parameters.
     *
     * @param url        target URL
     * @param parameters form fields to send (may be {@code null} or empty)
     * @return the response body, or {@code null} on failure
     */
    public static String post(String url, Map<String, Object> parameters) {
        HttpPost httpPost = new HttpPost(url);
        CloseableHttpResponse response = null;
        try {
            // Attach the form parameters (no-op when parameters is null/empty)
            setRequestParamter(parameters, httpPost);
            // Execute the request in the shared context (keeps cookies)
            response = httpClient.execute(httpPost, context);
            return copyResponse2Str(response);
        } catch (Exception e) {
            LOG.debug("请求失败\t" + url, e);
        } finally {
            try {
                if (response != null) {
                    response.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return null;
    }
    /**
     * Attaches form parameters to a POST request as a URL-encoded entity.
     *
     * @param parameters form fields; ignored when {@code null} or empty
     * @param httpPost   request to populate
     * @throws UnsupportedEncodingException if {@link #DEFAULT_ENCODING} is unsupported
     */
    private static void setRequestParamter(Map<String, Object> parameters, HttpPost httpPost)
            throws UnsupportedEncodingException {
        List<NameValuePair> nvps;
        if (parameters != null && parameters.size() > 0) {
            nvps = new ArrayList<NameValuePair>();
            for (Map.Entry<String, Object> map : parameters.entrySet()) {
                NameValuePair param = new BasicNameValuePair(map.getKey(), map.getValue().toString());
                nvps.add(param);
            }
            httpPost.setEntity(new UrlEncodedFormEntity(nvps, DEFAULT_ENCODING));
        }
    }
    /**
     * Parses a query string such as {@code age=7&name=8} into name/value pairs.
     *
     * @param parameters query string of {@code key=value} pairs joined by '&'
     * @return the parsed pairs
     */
    @SuppressWarnings("unused")
    private static List<NameValuePair> toNameValuePairList(String parameters) {
        List<NameValuePair> nvps = new ArrayList<NameValuePair>();
        String[] paramList = parameters.split("&");
        for (String parm : paramList) {
            // Split each "key=value" token at the first '='.
            int index = parm.indexOf("=");
            String key = parm.substring(0, index);
            String value = parm.substring(index + 1);
            nvps.add(new BasicNameValuePair(key, value));
        }
        return nvps;
    }
    /**
     * Manually adds a cookie to the shared cookie store.
     *
     * @param name   cookie name
     * @param value  cookie value
     * @param domain cookie domain
     * @param path   cookie path
     */
    public void addCookie(String name, String value, String domain, String path) {
        BasicClientCookie cookie = new BasicClientCookie(name, value);
        cookie.setDomain(domain);
        cookie.setPath(path);
        cookieStore.addCookie(cookie);
    }
    /**
     * Prints the status line and headers of a response to the console.
     *
     * @param httpResponse response to dump
     * @throws ParseException on header parse errors
     * @throws IOException    on I/O errors
     */
    public static void printResponse(HttpResponse httpResponse) throws ParseException, IOException {
        // Response status
        System.out.println("status:" + httpResponse.getStatusLine());
        System.out.println("headers:");
        HeaderIterator iterator = httpResponse.headerIterator();
        while (iterator.hasNext()) {
            System.out.println("\t" + iterator.next());
        }
    }
    /**
     * Prints every cookie currently held in the shared context to the console.
     */
    public static void printCookies() {
        cookieStore = context.getCookieStore();
        List<Cookie> cookies = cookieStore.getCookies();
        for (Cookie cookie : cookies) {
            System.out.println("key:" + cookie.getName() + " value:" + cookie.getValue());
        }
    }
    /**
     * Checks whether a cookie with the given name exists in the shared context.
     *
     * @param key cookie name to look for
     * @return {@code true} if a cookie with that name exists
     */
    public static boolean checkCookie(String key) {
        cookieStore = context.getCookieStore();
        List<Cookie> cookies = cookieStore.getCookies();
        boolean res = false;
        for (Cookie cookie : cookies) {
            if (cookie.getName().equals(key)) {
                res = true;
                break;
            }
        }
        return res;
    }
    /**
     * Converts a response entity directly into a String via EntityUtils.
     *
     * @param httpResponse response whose entity to read
     * @return the entity content, or {@code null} when there is no entity
     * @throws ParseException on parse errors
     * @throws IOException    on I/O errors
     */
    public static String toString(CloseableHttpResponse httpResponse) throws ParseException, IOException {
        HttpEntity entity = httpResponse.getEntity();
        if (entity != null)
            return EntityUtils.toString(entity);
        else
            return null;
    }
}
常量
package com.yellowcong.http.common;
/**
 * Author: yellowcong
 * Date: 2017/11/24
 * Time: 9:00:19
 * Description: Shared constants for the crawler — the character encoding of
 * the target site and the URL of the novel's table-of-contents page.
 */
public class Constants {
    /** Character encoding used by the target website. */
    public static final String WEB_ENCODE ="UTF-8";
    /** URL of the novel's home (table-of-contents) page. */
    public static final String HOME_URL ="https://www.yite.cc/book/dxjdntbb/";
}
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>yellowcong</groupId>
<artifactId>day111_24</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>day111_24</name>
<url>http://maven.apache.org</url>
<!-- 配置国内比较快的 阿里云的Maven仓库 -->
<repositories>
<repository>
<id>aliyunmaven</id>
<url>http://maven.aliyun.com/nexus/content/groups/public/</url>
</repository>
</repositories>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- 日志 -->
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.16</version>
</dependency>
<!-- 网页解析 -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.7.3</version>
</dependency>
<!-- http协议 解析 BEGIN-->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.2</version>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.9</version>
</dependency>
<!-- http协议 解析 END-->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
问题合集
1、伪装成浏览器
所谓魔高一尺,道高一丈,所以,我就可以通过伪装浏览器来解决
//伪装为浏览器
httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0");
下面报错403,是由于做了处理,不能让爬虫访问这个网站