最后更新:2020年8月31日11:42:00

 

一、概述

URLConnection是java.net包中的一个抽象类,其主要用于实现应用程序与URL之间的通信;

HttpURLConnection继承自URLConnection,也是抽象类;

在网络爬虫中,可以使用URLConnection或HttpURLConnection请求URL获取流数据,通过对流数据的操作,获取具体的实体内容;

 

二、实例化

1、说明

URLConnection与HttpURLConnection都是抽象类,无法直接实例化,但可以通过java.net包中URL类的openConnection()方法获取实例;当URL的协议为http或https时,返回的对象可以强制转换为HttpURLConnection;

 

2、代码示例

package com.zb.book.connection;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;

/**
 * Shows how connection objects are obtained: they are produced by
 * {@link URL#openConnection()} instead of being constructed directly.
 */
public class Main {
    /**
     * Opens two independent connection objects for the same URL, one held as
     * the general type and one narrowed to the HTTP-specific type.
     *
     * @param args unused
     * @throws IOException if the URL is malformed or a connection object cannot be created
     */
    public static void main(String[] args) throws IOException {
        final URL target = new URL("http://www.baidu.com/");

        // General-purpose handle.
        final URLConnection generic = target.openConnection();

        // A second connection object, cast to the HTTP-specific subtype.
        final HttpURLConnection http = (HttpURLConnection) target.openConnection();
    }
}

 

三、通过GET请求获取响应体html

package com.zb.book.connection;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

/**
 * Fetches a page with an HTTP GET request and prints the response body.
 */
public class Main {
    /**
     * Sends a GET request to http://www.baidu.com/ and, when the response code
     * is 200, prints the body (decoded as UTF-8) to standard output.
     *
     * @param args unused
     * @throws IOException if the connection cannot be established or the body cannot be read
     */
    public static void main(String[] args) throws IOException {
        URL url = new URL("http://www.baidu.com/");
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        try {
            // Allow reading from the connection and use the GET method.
            connection.setDoInput(true);
            connection.setRequestMethod("GET");
            connection.connect();

            if (HttpURLConnection.HTTP_OK == connection.getResponseCode()) {
                StringBuilder response = new StringBuilder();
                // try-with-resources closes the reader even when readLine() throws.
                try (BufferedReader reader = new BufferedReader(
                        new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                    String line;
                    // Lines are concatenated without separators, as before.
                    while ((line = reader.readLine()) != null) {
                        response.append(line);
                    }
                }
                System.out.println(response.toString());
            }
        } finally {
            // Release the connection on both the success and the failure path.
            connection.disconnect();
        }
    }
}

 

四、模拟提交表单(POST请求)

package com.zb.book.connection;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

/**
 * Simulates submitting an HTML form by sending a POST request.
 */
public class PostSubmitForm {
    /**
     * Posts the form fields {@code wen} and {@code action} and prints the
     * response body (decoded as UTF-8) to standard output.
     *
     * @param args unused
     * @throws IOException if the request cannot be sent or the response cannot be read
     */
    public static void main(String[] args) throws IOException {
        // Form parameters to submit.
        String wen = "EH629625211CS";
        String action = "ajax";

        // The scheme must be followed by "//"; without it the URL has no host.
        URL url = new URL("http://www.***.com/ems.php");
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        try {
            // Enable sending a request body.
            connection.setDoOutput(true);
            connection.setRequestMethod("POST");

            // Encode each value so reserved characters cannot corrupt the form body,
            // and use an explicit charset instead of the platform default.
            String form = "wen" + "=" + encode(wen) + "&" + "action" + "=" + encode(action);
            byte[] payload = form.getBytes(StandardCharsets.UTF_8);

            // Close the output stream once the body has been written.
            try (java.io.OutputStream out = connection.getOutputStream()) {
                out.write(payload);
            }

            StringBuilder html = new StringBuilder();
            // try-with-resources closes the reader even when readLine() throws.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    html.append(line);
                }
            }
            System.out.println(html.toString());
        } finally {
            connection.disconnect();
        }
    }

    /** URL-encodes a single form value as UTF-8. */
    private static String encode(String value) throws IOException {
        return java.net.URLEncoder.encode(value, StandardCharsets.UTF_8.name());
    }
}

 

五、设置请求头信息

package com.zb.book.connection;

import java.io.IOException;
import java.net.URL;
import java.net.URLConnection;

/**
 * Shows how to attach request headers to a connection before it is opened.
 */
public class SetHeader {
    /**
     * Applies a fixed set of request headers and then connects.
     *
     * @param args unused
     * @throws IOException if the URL is malformed or connecting fails
     */
    public static void main(String[] args) throws IOException {
        // Header name/value pairs, applied in this order.
        final String[][] requestHeaders = {
            {"Accept", "text/html"},
            {"Accept-Language", "zh-CN,zh;q=0.5"},
            {"Host", "www.***.com.cn"},
            {"Cache-Control", "max-age=0"},
            {"User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"},
        };

        // The result could equally be cast to HttpURLConnection.
        final URLConnection conn = new URL("http://www.***.com.cn/b.asp").openConnection();

        for (String[] header : requestHeaders) {
            conn.setRequestProperty(header[0], header[1]);
        }

        conn.connect();
        // Further operations on the connection would follow here.
    }
}

 

六、超时设置

1、概述

使用URLConnection或HttpURLConnection时,可以设置两种超时时间,分别是连接超时时间(ConnectTimeout)和读取超时时间(ReadTimeout);

 

2、代码演示

package com.zb.book.connection;

import java.io.IOException;
import java.net.URL;
import java.net.URLConnection;

/**
 * Shows how to configure the connect and read timeouts of a connection.
 */
public class SetTimeout {
    /**
     * Creates a connection object and sets both timeouts to the same value.
     *
     * @param args unused
     * @throws IOException if the URL is malformed or the connection object cannot be created
     */
    public static void main(String[] args) throws IOException {
        // Timeout value shared by both settings (the API takes milliseconds).
        final int timeoutMillis = 10000;

        // The result could equally be cast to HttpURLConnection.
        final URLConnection conn = new URL("http://www.***.com.cn/b.asp").openConnection();

        conn.setConnectTimeout(timeoutMillis); // limit for establishing the connection
        conn.setReadTimeout(timeoutMillis);    // limit for waiting on data while reading
        // Further operations on the connection would follow here.
    }
}

 

七、代理服务器的使用

package com.zb.book.connection;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URL;
import java.net.URLConnection;

/**
 * Shows how to route a connection through an HTTP proxy server.
 */
public class SetProxy {
    /**
     * Builds an HTTP proxy description, opens the URL through it and connects.
     *
     * @param args unused
     * @throws IOException if the URL is malformed or connecting fails
     */
    public static void main(String[] args) throws IOException {
        // Address and port of the proxy server.
        final InetSocketAddress proxyEndpoint = new InetSocketAddress("171.97.67.160", 3128);
        final Proxy httpProxy = new Proxy(Proxy.Type.HTTP, proxyEndpoint);

        final URL target = new URL("http://www.***.com.cn/b.asp");

        // Passing the proxy to openConnection makes this connection use it.
        final URLConnection conn = target.openConnection(httpProxy);
        conn.connect();
    }
}

 

八、HTTPS请求(跳过证书验证)

package com.crawler;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.security.cert.X509Certificate;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocketFactory;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
/**
 * Fetches an HTTPS page after installing a trust manager that accepts every
 * certificate.
 *
 * <p><strong>Security warning:</strong> {@link #initUnSecureTSL()} disables
 * certificate validation for every {@link HttpsURLConnection} created
 * afterwards in this JVM, which removes protection against man-in-the-middle
 * attacks. Do not use this outside of experiments.
 */
public class URLConnectionSSL {
    /**
     * Requests a fixed HTTPS URL and prints the response body, or {@code null}
     * when the response code is not 200.
     *
     * @param args unused
     * @throws IOException if the request cannot be sent or the response cannot be read
     */
    public static void main(String[] args) throws IOException {
        initUnSecureTSL();

        URL url = new URL("https://cn.kompass.com/a/hospitality-tourism-hotel-and-catering-industries/78/");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        String responseBody = null;
        try {
            int statusCode = conn.getResponseCode();
            if (HttpURLConnection.HTTP_OK == statusCode) {
                StringBuilder response = new StringBuilder();
                // try-with-resources closes the reader even when readLine() throws.
                try (BufferedReader reader = new BufferedReader(
                        new InputStreamReader(conn.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        response.append(line);
                    }
                }
                responseBody = response.toString();
            }
        } finally {
            conn.disconnect();
        }
        System.out.println(responseBody);
    }

    /**
     * Installs a default SSL socket factory whose trust manager performs no
     * certificate checks. If the SSL context cannot be set up, the error is
     * reported and the previous default factory stays in place.
     */
    private static void initUnSecureTSL() {
        // Trust manager that accepts any certificate chain (no validation).
        final TrustManager[] trustAllCerts = new TrustManager[]{new X509TrustManager() {
            @Override
            public void checkClientTrusted(final X509Certificate[] chain, final String authType) {
                // Intentionally empty: every client certificate is accepted.
            }

            @Override
            public void checkServerTrusted(final X509Certificate[] chain, final String authType) {
                // Intentionally empty: every server certificate is accepted.
            }

            @Override
            public X509Certificate[] getAcceptedIssuers() {
                // The interface contract requires a non-null (possibly empty) array.
                return new X509Certificate[0];
            }
        }};
        try {
            // "TLS" is the current standard protocol name; "SSL" is the legacy one.
            SSLContext sslContext = SSLContext.getInstance("TLS");
            sslContext.init(null, trustAllCerts, new java.security.SecureRandom());
            SSLSocketFactory sslSocketFactory = sslContext.getSocketFactory();
            // Applies to every HttpsURLConnection created after this call.
            HttpsURLConnection.setDefaultSSLSocketFactory(sslSocketFactory);
        } catch (java.security.GeneralSecurityException e) {
            // Report and continue: the previously configured factory remains in use.
            e.printStackTrace();
        }
    }
}