项目源代码 https://gitee.com/fakerlove/jsoup
文章目录
- 2. http-client 讲解
- 2.1 get 请求
- 2.2 get 带参数
- 工具类
- 发送请求
- 2.3 Post 请求
- 2.4 Post 带参数
- 2.5 连接池
- 2.6 参数
2. http-client 讲解
2.1 get 请求
请求的网址
https://www.baidu.com
java 代码
package com.ak;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
//@Slf4j
/**
 * Minimal HttpClient example: sends a plain GET request and prints the length of the body.
 */
public class demo {
    /** Character set used to decode the body when the response does not declare one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /**
     * Requests https://www.baidu.com and, on HTTP 200, prints the number of characters
     * in the response body.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // The request object plays the role of "typing the address into the browser".
        HttpGet httpGet = new HttpGet("https://www.baidu.com");

        // try-with-resources closes the response first and the client second, so neither
        // the connection nor the client is leaked, even when execute() throws.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            boolean ok = response.getStatusLine().getStatusCode() == 200;
            // EntityUtils.toString rejects a null entity, so guard against an empty response.
            if (ok && response.getEntity() != null) {
                String content = EntityUtils.toString(response.getEntity(), DEFAULT_CHARSET);
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
2.2 get 带参数
请求的网址
https://movie.douban.com/tag/#/?sort=U&range=0,10&tags=电影
工具类
package com.ak.utils;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Random;
/**
 * Shared HTTP resources for the crawler examples: a pooled connection manager,
 * a list of browser User-Agent strings, and a common request configuration.
 */
public class HttpUtils {
    /** Connection pool handed to every client built by the examples. */
    public static PoolingHttpClientConnectionManager cm;
    /** Browser User-Agent strings; callers pick one at random for each request. */
    public static ArrayList<String> agents;

    static {
        cm = new PoolingHttpClientConnectionManager();
        // Maximum number of connections in the whole pool.
        cm.setMaxTotal(100);
        // Maximum number of connections per route (roughly: per target site).
        cm.setDefaultMaxPerRoute(10);

        // Each entry appears exactly once so that random selection is uniform.
        agents = new ArrayList<String>();
        agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36");
        agents.add("Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50");
        agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0");
        agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2");
        agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");
        agents.add("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11");
        agents.add("Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16");
        agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36");
        agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER");
        agents.add("Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0");
        agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36");
        System.out.println("<--------- HttpUtils initialization success --------->");
    }

    /**
     * Builds the request configuration shared by the examples.
     *
     * @return a configuration with a 1 s connect timeout, a 1 s timeout for obtaining a
     *         connection from the pool, and a 10 s socket (data transfer) timeout
     */
    public static RequestConfig getConfig() {
        return RequestConfig.custom()
                // Maximum time to establish the connection.
                .setConnectTimeout(1000)
                // Maximum time to wait for a connection from the pool.
                .setConnectionRequestTimeout(1000)
                // Maximum time allowed for data transfer.
                .setSocketTimeout(10 * 1000)
                .build();
    }
}
发送请求
package com.ak;
import com.ak.utils.HttpUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.Random;
/**
* 请求内容
* 爬取豆瓣的内容,因为豆瓣使用了爬虫所以需要一些工具类
*/
/**
 * GET request with query parameters.
 * Douban applies anti-crawler measures, so the request goes through the HttpUtils helpers
 * (pooled connections, timeouts and a randomly chosen User-Agent).
 */
public class demo2 {
    /** Character set used to decode the body when the response does not declare one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /**
     * Requests https://movie.douban.com/tag/ with the query parameters
     * sort=U, range=0,10 and tags=电影, and prints the body on HTTP 200.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // The client is deliberately left open: it is backed by the shared pool HttpUtils.cm,
        // and closing the client would shut that pool down as well.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(HttpUtils.cm).build();
        try {
            // URIBuilder percent-encodes the parameter values (including the Chinese tag).
            URIBuilder uriBuilder = new URIBuilder("https://movie.douban.com/tag/");
            uriBuilder.setParameter("sort", "U");
            uriBuilder.setParameter("range", "0,10");
            uriBuilder.setParameter("tags", "电影");

            HttpGet httpGet = new HttpGet(uriBuilder.build());
            httpGet.setConfig(HttpUtils.getConfig());
            // Pick a random browser User-Agent for this request.
            int agentIndex = new Random().nextInt(HttpUtils.agents.size());
            httpGet.addHeader("User-Agent", HttpUtils.agents.get(agentIndex));

            // Closing the response releases its connection, even if reading the body fails.
            try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                boolean ok = response.getStatusLine().getStatusCode() == 200;
                if (ok && response.getEntity() != null) {
                    String content = EntityUtils.toString(response.getEntity(), DEFAULT_CHARSET);
                    System.out.println(content);
                }
            }
        } catch (IOException | URISyntaxException e) {
            // Only the failures this code can actually raise: network I/O and URI construction.
            e.printStackTrace();
        }
    }
}
爬取
https://www.baidu.com/s?wd=faker
相当于在百度中搜索 faker 的相关信息
package com.ak;
import com.ak.utils.HttpUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.Random;
/**
* 请求内容
* 爬取豆瓣的内容,因为豆瓣使用了爬虫所以需要一些工具类
*/
/**
 * GET request with a query parameter: searches Baidu for the keyword "faker".
 * The request goes through the HttpUtils helpers (pooled connections, timeouts and a
 * randomly chosen User-Agent).
 */
public class demo2 {
    /** Character set used to decode the body when the response does not declare one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /**
     * Requests https://www.baidu.com/s with the query parameter wd=faker and prints the
     * body on HTTP 200.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // The client is deliberately left open: it is backed by the shared pool HttpUtils.cm,
        // and closing the client would shut that pool down as well.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(HttpUtils.cm).build();
        try {
            URIBuilder uriBuilder = new URIBuilder("https://www.baidu.com/s");
            uriBuilder.setParameter("wd", "faker");

            HttpGet httpGet = new HttpGet(uriBuilder.build());
            httpGet.setConfig(HttpUtils.getConfig());
            // Pick a random browser User-Agent for this request.
            int agentIndex = new Random().nextInt(HttpUtils.agents.size());
            httpGet.addHeader("User-Agent", HttpUtils.agents.get(agentIndex));

            // Closing the response releases its connection, even if reading the body fails.
            try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                boolean ok = response.getStatusLine().getStatusCode() == 200;
                if (ok && response.getEntity() != null) {
                    String content = EntityUtils.toString(response.getEntity(), DEFAULT_CHARSET);
                    System.out.println(content);
                }
            }
        } catch (IOException | URISyntaxException e) {
            // Only the failures this code can actually raise: network I/O and URI construction.
            e.printStackTrace();
        }
    }
}
2.3 Post 请求
请求网址
https://www.baidu.com
java代码
与 get 请求相比,唯一的区别是这里使用 HttpPost 代替了 HttpGet
package com.ak;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
//@Slf4j
/**
 * Minimal HttpClient example: sends a POST request without a body and prints the response.
 * Compared with the GET example, only the request class differs (HttpPost instead of HttpGet).
 */
public class demo3 {
    /** Character set used to decode the body when the response does not declare one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /**
     * Posts to https://www.baidu.com and prints the response body on HTTP 200.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        HttpPost httpPost = new HttpPost("https://www.baidu.com");

        // try-with-resources closes the response first and the client second, so neither
        // the connection nor the client is leaked, even when execute() throws.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            boolean ok = response.getStatusLine().getStatusCode() == 200;
            // EntityUtils.toString rejects a null entity, so guard against an empty response.
            if (ok && response.getEntity() != null) {
                String content = EntityUtils.toString(response.getEntity(), DEFAULT_CHARSET);
                System.out.println(content);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
2.4 Post 带参数
爬取网址
http://yun.itheima.com/search
java 代码
package com.ak;
import com.ak.utils.HttpUtils;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
//@Slf4j
/**
 * POST request with form parameters: submits keys=java to http://yun.itheima.com/search.
 */
public class demo4 {
    /** Character set used to decode the body when the response does not declare one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /**
     * Sends the form, prints the response object, and on HTTP 200 also prints the body.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // The client is deliberately left open: it is backed by the shared pool HttpUtils.cm,
        // and closing the client would shut that pool down as well.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(HttpUtils.cm).build();
        try {
            // Form fields, sent URL-encoded in the request body.
            List<NameValuePair> params = new ArrayList<>();
            params.add(new BasicNameValuePair("keys", "java"));
            UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf-8");

            HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");
            httpPost.setConfig(HttpUtils.getConfig());
            httpPost.setEntity(formEntity);
            // Send a random browser User-Agent header (this is a header, not a proxy setting).
            int agentIndex = new Random().nextInt(HttpUtils.agents.size());
            httpPost.addHeader("User-Agent", HttpUtils.agents.get(agentIndex));

            // Closing the response releases its connection, even if reading the body fails.
            try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
                System.out.println(response);
                boolean ok = response.getStatusLine().getStatusCode() == 200;
                if (ok && response.getEntity() != null) {
                    String content = EntityUtils.toString(response.getEntity(), DEFAULT_CHARSET);
                    System.out.println(content);
                }
            }
        } catch (IOException e) {
            // Covers network failures and the UnsupportedEncodingException of the form entity.
            e.printStackTrace();
        }
    }
}
2.5 连接池
package com.ak.utils;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Random;
/**
 * Shared HTTP resources for the crawler examples: a pooled connection manager,
 * a list of browser User-Agent strings, a common request configuration, and a helper
 * that downloads the source of a page.
 */
public class HttpUtils {
    /** Connection pool handed to every client built by this class and by the examples. */
    public static PoolingHttpClientConnectionManager cm;
    /** Browser User-Agent strings; one is picked at random for each request. */
    public static ArrayList<String> agents;

    static {
        cm = new PoolingHttpClientConnectionManager();
        // Maximum number of connections in the whole pool.
        cm.setMaxTotal(100);
        // Maximum number of connections per route (roughly: per target site).
        cm.setDefaultMaxPerRoute(10);

        // Each entry appears exactly once so that random selection is uniform.
        agents = new ArrayList<String>();
        agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36");
        agents.add("Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50");
        agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0");
        agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2");
        agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");
        agents.add("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11");
        agents.add("Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16");
        agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36");
        agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER");
        agents.add("Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0");
        agents.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36");
        System.out.println("<--------- HttpUtils initialization success --------->");
    }

    /**
     * Downloads the source of a page with a GET request.
     *
     * @param url address of the page
     * @return the page source when the server answers HTTP 200 with a body; an empty string
     *         when the URL is null or blank, the status is not 200, the body is missing,
     *         or an I/O error occurs
     */
    public String doGetHtml(String url) {
        // A null or blank URL would make the HttpGet constructor throw; report it the same
        // way as every other failure instead.
        if (url == null || url.trim().isEmpty()) {
            System.out.println("<--------- doGetHtml() ERROR --------->");
            return "";
        }

        // Build a client on top of the shared pool. The client is not closed here:
        // its connections are managed by the pool.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        HttpGet httpGet = new HttpGet(url);
        // Send a random browser User-Agent (counter-measure against anti-crawler checks).
        int agentIndex = new Random().nextInt(agents.size());
        httpGet.addHeader("User-Agent", agents.get(agentIndex));
        httpGet.setConfig(getConfig());

        // Closing the response releases its connection, even if reading the body fails.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            boolean ok = response.getStatusLine().getStatusCode() == 200;
            if (ok && response.getEntity() != null) {
                // Decode as UTF-8 when the response does not declare a charset.
                return EntityUtils.toString(response.getEntity(), "utf8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println("<--------- doGetHtml() ERROR --------->");
        return "";
    }

    /**
     * Builds the request configuration shared by the examples.
     *
     * @return a configuration with a 1 s connect timeout, a 1 s timeout for obtaining a
     *         connection from the pool, and a 10 s socket (data transfer) timeout
     */
    public static RequestConfig getConfig() {
        return RequestConfig.custom()
                // Maximum time to establish the connection.
                .setConnectTimeout(1000)
                // Maximum time to wait for a connection from the pool.
                .setConnectionRequestTimeout(1000)
                // Maximum time allowed for data transfer.
                .setSocketTimeout(10 * 1000)
                .build();
    }
}
2.6 参数
/**
 * Builds the request configuration used by the examples.
 *
 * @return a configuration with a 1 s connect timeout, a 1 s timeout for obtaining a
 *         connection from the pool, and a 10 s socket (data transfer) timeout
 */
public static RequestConfig getConfig() {
    // All values are in milliseconds.
    final int connectTimeoutMs = 1000;            // establishing the connection
    final int connectionRequestTimeoutMs = 1000;  // obtaining a connection from the pool
    final int socketTimeoutMs = 10 * 1000;        // data transfer

    RequestConfig.Builder builder = RequestConfig.custom();
    builder.setConnectTimeout(connectTimeoutMs);
    builder.setConnectionRequestTimeout(connectionRequestTimeoutMs);
    builder.setSocketTimeout(socketTimeoutMs);
    return builder.build();
}