最近学习网页设计,想仿网络上的一个页面,图片素材一个一个地保存起来太麻烦。就想着利用Java来实现一个小小的网页图片爬虫,代码很简单,不一会儿就实现了,但是当我访问https协议的图片时,一直报javax.net.ssl.SSLKeyException异常。到我存储图片的目录一看,http协议的图片已经爬取下来了,但https的没有。花了三天,还是一直卡在这个异常上。在百度、Google上找了很久,看大神们的博客源码,一个个写得都差不多,我也照着写了一遍,但就是没有找到办法实现HTTPS的访问,一直出现javax.net.ssl.SSLKeyException。
没办法,不能只是copy,找了很多教程发现原来http与https是有区别的。简单理解就是https是一种加密的http,也就是在HTTP上加入了SSL协议,SSL依靠证书来验证服务器的身份,从而对客户端与服务器之间的数据通信进行加密,……。切入正题,要进行HTTPS的连接就必须验证证书。但是证书我们没有,所以想要连接就必须获得证书。获取网页证书其实也挺简单,访问https网站时,浏览器地址栏会有个绿色小锁标志,
查看证书
点击复制到文件……,就可以保存到本地。保存到本地后,在连接https的时候读取证书文件作为验证就可以连接到https了,不过这一方法我没有写,因为每抓取一个网站的图片就要获取这个网站的证书,还是有点麻烦的。所以我使用的是下面的第二种方法:跳过证书验证,即访问任何HTTPS网站都不需要验证证书,虽然说不安全,但是对于只是爬取图片来说,没什么差别。废话不多说,上代码。
- 跳过SSL证书验证方法
/**
 * Builds a TLS socket factory backed solely by {@code myTrustManager} and, as a
 * side effect, installs {@code ignoreHostnameVerifier} and a factory from the same
 * context as the process-wide defaults of {@link HttpsURLConnection}.
 *
 * <p>WARNING: with the trust manager and verifier shown in this article, this
 * disables certificate and host name checking for HTTPS connections.
 *
 * @return a socket factory created from the freshly initialised TLS context
 * @throws NoSuchAlgorithmException if no provider supplies a "TLS" context
 * @throws KeyManagementException if the context cannot be initialised
 */
public static SSLSocketFactory createSSL() throws KeyManagementException, NoSuchAlgorithmException, NoSuchProviderException, KeyStoreException, CertificateException, FileNotFoundException, IOException{
    SSLContext tlsContext = SSLContext.getInstance("TLS");
    // No key managers and the default SecureRandom; only the trust decision is customised.
    tlsContext.init(null, new TrustManager[]{ myTrustManager }, null);
    SSLSocketFactory factory = tlsContext.getSocketFactory();
    // Global defaults: affect HttpsURLConnection instances created after this point.
    HttpsURLConnection.setDefaultHostnameVerifier(ignoreHostnameVerifier);
    HttpsURLConnection.setDefaultSSLSocketFactory(tlsContext.getSocketFactory());
    return factory;
}
/**
 * Trust manager that accepts every client and server certificate chain without
 * inspecting it.
 *
 * <p>WARNING: using it removes the protection HTTPS gives against impersonated
 * servers; it is only acceptable for throw-away tools such as this image crawler.
 */
public static TrustManager myTrustManager = new X509TrustManager()
{
    @Override
    public void checkClientTrusted(X509Certificate[] arg0, String arg1) throws CertificateException {}
    @Override
    public void checkServerTrusted(X509Certificate[] arg0, String arg1){}
    @Override
    public X509Certificate[] getAcceptedIssuers() {
        // The X509TrustManager contract requires a non-null (possibly empty) array;
        // returning null can cause a NullPointerException in code that iterates the result.
        return new X509Certificate[0];
    }
};
这就解决了HTTPS连接时的验证问题。其实网上的代码也是如此,但是我就是在这个地方卡了很久,网上搜索到的代码都是这个类型,但是都没有提到我接下来遇到的问题。我觉得这个问题虽然很小,但是对于新手来说很致命,就像我一样,所以我需要自己记下来,顺便可以分享给同样遇到这个问题的朋友。代码正确但却又报javax.net.ssl.SSLKeyException异常。其实关键在于JRE中lib/ext文件夹中的文件,具体为什么我也不清楚,搜索问题时有人提到这个文件夹,所以我就试了试。我把这个文件夹的内容导入到Eclipse项目中,但是问题还是没有解决,所以我对比了一下Java安装目录下的JDK和JRE,发现JRE/lib包含的东西比JDK中的东西多,好吧其实我也不懂。就抱着尝试的态度把项目中的Java运行环境改成JRE,想不到运行可以了,简直不可思议。在项目中右击运行方式,英文版的好像是Run As,打开运行配置,选JRE,选择备用JRE,点击已安装
点进去后添加(Add)JRE,JRE主目录填写JRE安装的位置(非JDK),完成。
完成后,选中你添加的JRE,点击应用,确认。此时再运行,发现不报错了。本地也找到爬取的图片,完成。
下面贴上完整的代码:Utils类
package com.get;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.net.ssl.SSLContext;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLContextBuilder;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
/**
 * Helper routines for the image crawler: fetching page source, scanning it with
 * regular expressions and deriving a file name from an image URL.
 *
 * @author 90604
 */
public class Utils {
    /** Regex for an {@code <img>} tag that carries a {@code src} attribute. */
    public static final String IMGURL_REG = "<img.*src=(.*?)[^>]*?>";
    /**
     * Regex for an absolute URL ({@code scheme://...}) running up to the next whitespace.
     * The scheme class is {@code [a-zA-Z]}; the former {@code A-z} range also
     * admitted the characters {@code [ \ ] ^ _ `}.
     */
    public static final String IMGSRC_REG = "[a-zA-Z]+://[^\\s]*";
    /** Regex for a CSS {@code url(...)} reference; group 1 is the text between the parentheses. */
    public static final String BACKGROUND_REG = "url\\((\\S+?)\\)";
    /**
     * Regex for an absolute http/https link ending in .jpg, .gif or .png. The link
     * body excludes whitespace, quotes and angle brackets, so one match cannot
     * swallow several attributes or tags that share a line.
     */
    public static final String HTTP_IMG = "(http|https)://[^\\s\"'<>]+\\.(jpg|gif|png)";

    /**
     * Downloads the page at {@code urlString} with {@link HttpURLConnection} and
     * returns its body decoded as UTF-8.
     *
     * @param urlString absolute http or https URL of the page
     * @return the page text, each line terminated by the platform line separator
     * @throws IOException if the URL is malformed or the connection fails
     */
    public static String getHtml(String urlString) throws IOException {
        URL url = new URL(urlString);
        HttpURLConnection hrc = (HttpURLConnection) url.openConnection();
        try {
            // convertStreamToString closes the stream once it has been drained.
            return Utils.convertStreamToString(hrc.getInputStream());
        } finally {
            hrc.disconnect();
        }
    }

    /**
     * Downloads the resource at {@code url} with Apache HttpClient and returns its
     * body decoded as UTF-8. The status line of the response is printed to stdout.
     *
     * @param url absolute URL to request
     * @return the response text, or {@code null} if the request failed or the
     *         response carried no body
     */
    public static String setImageConnectTool(String url) {
        String html = null;
        RequestConfig globalConfig =
                RequestConfig.custom()
                        .setCookieSpec(CookieSpecs.STANDARD)
                        // Maximum wait, in milliseconds, for a connection from the connection manager.
                        .setConnectionRequestTimeout(5000)
                        // Maximum wait, in milliseconds, for the connection to be established.
                        .setConnectTimeout(6000)
                        .build();
        CloseableHttpClient httpClient = HttpClients.custom()
                .setDefaultRequestConfig(globalConfig)
                .build();
        HttpGet httpGet = new HttpGet(url);
        httpGet.addHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0");
        try {
            CloseableHttpResponse response = httpClient.execute(httpGet);
            try {
                System.out.println(response.getStatusLine());
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    // Keep the decoded body; it used to be computed and then thrown away.
                    html = Utils.convertStreamToString(entity.getContent());
                }
            } finally {
                response.close();
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return html;
    }

    /**
     * Reads {@code in} to the end as UTF-8 text and closes it. A read error is
     * printed and whatever was read up to that point is returned.
     *
     * @param in stream to drain; always closed before this method returns
     * @return the text read, each line terminated by the platform line separator
     * @throws UnsupportedEncodingException if UTF-8 is not supported by the runtime
     */
    public static String convertStreamToString(InputStream in) throws UnsupportedEncodingException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
        StringBuilder sb = new StringBuilder();
        String lineSeparator = System.getProperty("line.separator");
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append(lineSeparator);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return sb.toString();
    }

    /**
     * Collects every substring of {@code html} that matches {@code reg}.
     *
     * @param html text to scan; {@code null} is treated as empty input
     * @param reg  regular expression, e.g. {@link #HTTP_IMG}
     * @return the matches in order of appearance; empty if there are none
     */
    public static List<String> getImageUrl(String html, String reg) {
        List<String> listimgurl = new ArrayList<String>();
        if (html == null) {
            // A failed download leaves the caller with null; report "no images" instead of throwing.
            return listimgurl;
        }
        Matcher matcher = Pattern.compile(reg).matcher(html);
        while (matcher.find()) {
            listimgurl.add(matcher.group());
        }
        return listimgurl;
    }

    /**
     * Extracts the URLs matching {@link #IMGSRC_REG} from each given string and
     * drops the last character of every match.
     *
     * @param listimageurl strings to scan, typically matches of {@link #IMGURL_REG}
     * @return the extracted URLs, each without its final character
     */
    public List<String> getImageSrc(List<String> listimageurl) {
        List<String> listImageSrc = new ArrayList<String>();
        Pattern srcPattern = Pattern.compile(IMGSRC_REG);
        for (String image : listimageurl) {
            Matcher matcher = srcPattern.matcher(image);
            while (matcher.find()) {
                String match = matcher.group();
                listImageSrc.add(match.substring(0, match.length() - 1));
            }
        }
        return listImageSrc;
    }

    /**
     * Returns the part of {@code urlName} after its last {@code '/'}, or the whole
     * string when it contains no slash.
     *
     * @param urlName URL or path of the image
     * @return the trailing path segment, used as the local file name
     */
    public static String getImageName(String urlName) {
        return urlName.substring(urlName.lastIndexOf("/") + 1);
    }
}
ImageFile类(保存图片文件):
package com.get;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.NoSuchProviderException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import javax.net.ssl.SSLSocketFactory;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
/**
 * Downloads one image and stores it under {@link #PATH}. Each instance handles a
 * single URL and is intended to be run on its own thread.
 *
 * <p>WARNING: HTTPS downloads accept any certificate and any host name (see
 * {@link #myTrustManager} and {@link #ignoreHostnameVerifier}). That is tolerable
 * for scraping public images but must not be reused where the identity of the
 * server matters.
 */
public class ImageFile implements Runnable {
    private final String url;
    private final String name;
    private static final String PATH = "D:\\img\\";
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0";
    private static final int TIMEOUT_MILLIS = 5000;

    /** Host name verifier that approves every host. */
    static HostnameVerifier ignoreHostnameVerifier = new HostnameVerifier(){
        @Override
        public boolean verify(String arg0, SSLSession arg1) {
            return true;
        }
    };

    /**
     * @param url  absolute http or https URL of the image
     * @param name file name to store the image under inside {@link #PATH}
     */
    public ImageFile(String url,String name){
        this.url = url;
        this.name = name;
    }

    /**
     * Downloads the image and writes it to {@code PATH + name}. Failures are
     * printed to stderr and never propagate out of the thread.
     */
    @Override
    public void run() {
        File dir = new File(PATH);
        if (!dir.exists()) {
            dir.mkdirs();
            System.out.println("图片存放于"+PATH+"目录下");
        }
        InputStream in = null;
        OutputStream os = null;
        try {
            in = openImageStream(new URL(this.url));
            // Create the file only once the server has answered, so a failed
            // request does not leave an empty file behind.
            os = new FileOutputStream(new File(PATH + name));
            byte[] buff = new byte[1024];
            int read;
            while ((read = in.read(buff)) != -1) {
                os.write(buff, 0, read);
            }
        } catch (Exception e) {
            // Thread boundary: report and give up on this image only.
            e.printStackTrace();
        } finally {
            closeQuietly(os);
            closeQuietly(in);
        }
    }

    /**
     * Opens a GET connection to {@code u}, prints the response status and returns
     * the response body stream. HTTP and HTTPS share the same timeouts and
     * User-Agent; HTTPS additionally gets the trust-everything socket factory and
     * host name verifier.
     */
    private static InputStream openImageStream(URL u) throws IOException, KeyManagementException,
            NoSuchAlgorithmException, NoSuchProviderException, KeyStoreException, CertificateException {
        HttpURLConnection conn = (HttpURLConnection) u.openConnection();
        if ("https".equalsIgnoreCase(u.getProtocol())) {
            HttpsURLConnection https = (HttpsURLConnection) conn;
            https.setSSLSocketFactory(createSSL());
            // Set the verifier on this connection explicitly: the default installed by
            // createSSL() only reaches connections opened after that call.
            https.setHostnameVerifier(ignoreHostnameVerifier);
        }
        conn.setConnectTimeout(TIMEOUT_MILLIS);
        conn.setReadTimeout(TIMEOUT_MILLIS);
        conn.setRequestMethod("GET");
        conn.setRequestProperty("User-Agent", USER_AGENT);
        conn.connect();
        System.out.println(conn.getResponseCode() + " " + conn.getResponseMessage());
        return conn.getInputStream();
    }

    /** Closes {@code resource} if it is non-null, printing (not propagating) any failure. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds a TLS socket factory backed solely by {@link #myTrustManager} and, as a
     * side effect, installs it together with {@link #ignoreHostnameVerifier} as the
     * process-wide defaults of {@link HttpsURLConnection}.
     *
     * @return a socket factory that accepts every server certificate
     * @throws NoSuchAlgorithmException if no provider supplies a "TLS" context
     * @throws KeyManagementException if the context cannot be initialised
     */
    public static SSLSocketFactory createSSL() throws KeyManagementException, NoSuchAlgorithmException, NoSuchProviderException, KeyStoreException, CertificateException, FileNotFoundException, IOException{
        SSLContext sslContext = SSLContext.getInstance("TLS");
        sslContext.init(null, new TrustManager[]{ myTrustManager }, null);
        SSLSocketFactory ssf = sslContext.getSocketFactory();
        HttpsURLConnection.setDefaultHostnameVerifier(ignoreHostnameVerifier);
        HttpsURLConnection.setDefaultSSLSocketFactory(ssf);
        return ssf;
    }

    /** Trust manager that accepts every client and server certificate chain unchecked. */
    public static TrustManager myTrustManager = new X509TrustManager()
    {
        @Override
        public void checkClientTrusted(X509Certificate[] arg0, String arg1) throws CertificateException {}
        @Override
        public void checkServerTrusted(X509Certificate[] arg0, String arg1){}
        @Override
        public X509Certificate[] getAcceptedIssuers() {
            // The interface contract requires a non-null (possibly empty) array.
            return new X509Certificate[0];
        }
    };
}
主函数:
package com.get;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.NoSuchProviderException;
import java.util.ArrayList;
import java.util.List;
/**
 * Entry point of the crawler: downloads one page, extracts the image links found
 * in its source and starts one download thread per distinct link.
 */
public class Main_getImage {
    /**
     * Fetches {@code http://m.lashou.com/}, collects every match of
     * {@code Utils.HTTP_IMG} and hands each distinct URL to an {@code ImageFile}
     * running on its own thread. If the page cannot be fetched the error is
     * printed and nothing is downloaded.
     *
     * @param args unused
     */
    public static void main(String[] args) throws KeyManagementException, NoSuchAlgorithmException, NoSuchProviderException, IOException {
        String html;
        try {
            html = Utils.getHtml("http://m.lashou.com/");
        } catch (IOException e) {
            e.printStackTrace();
            // Without page source there is nothing to scan.
            return;
        }
        List<String> matches = Utils.getImageUrl(html, Utils.HTTP_IMG);
        // URLs already handed to a worker; a repeated URL would otherwise make two
        // threads write the same file at the same time.
        List<String> started = new ArrayList<String>();
        for (String match : matches) {
            // One match may span two URLs joined by an `original` attribute; split
            // them apart. A match without that separator comes back unchanged.
            // (The previous extension test in front of this split was always true,
            // so every match already went through it.)
            for (String imageUrl : match.split("\" original=\"")) {
                if (imageUrl.isEmpty() || started.contains(imageUrl)) {
                    continue;
                }
                started.add(imageUrl);
                System.out.println(imageUrl+"\n");
                new Thread(new ImageFile(imageUrl, Utils.getImageName(imageUrl))).start();
            }
        }
    }
}