开发简单爬虫项目系列踩坑

  • 背景介绍
  • 场景问题


背景介绍

应用Httpclient、OkHttp、RestTemplate进行一系列解析及复杂模拟人为过程

一言不合,先走读代码瞧瞧

/**
 * Spring configuration exposing the {@link RestTemplate} variants used by the crawler:
 * one backed by a pooled Apache HttpClient, one backed by OkHttp, and a factory method
 * that builds a fresh, non-shared HttpClient-backed template on every call.
 */
@Configuration
public class RestTemplateConfig {

    /** Keep-alive duration (ms) used when the default strategy reports -1. */
    private static final long FALLBACK_KEEP_ALIVE_MILLIS = 5000;

    /**
     * Default template, backed by the pooled {@link #httpClient()} bean.
     *
     * <p>Original author's note: a connection pool was the initial plan, but the way
     * HttpClient handles cookies by default had to be taken into account (see
     * {@link #getPrototypeRestTemplate()} for the non-pooled alternative).
     *
     * @return a RestTemplate using {@link #httpRequestFactory()}
     */
    @Bean
    public RestTemplate defaultRestTemplate() {
        return new RestTemplate(httpRequestFactory());
    }

    /**
     * Builds a brand-new RestTemplate with its own HttpClient on every call.
     *
     * <p>No connection manager or cookie store is configured on the builder, so nothing
     * is shared with the pooled {@link #httpClient()} bean. To share cookies across all
     * requests, a default cookie store would have to be set explicitly on the builder;
     * that is deliberately not done here.
     *
     * @return a new RestTemplate using 5-second connect / socket / pool-lease timeouts
     */
    public static RestTemplate getPrototypeRestTemplate() {
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectTimeout(5 * 1000)            // time allowed to establish the connection
                .setSocketTimeout(5 * 1000)             // time allowed between packets while reading
                .setConnectionRequestTimeout(5 * 1000)  // time allowed waiting for a connection lease
                .build();

        return new RestTemplate(new HttpComponentsClientHttpRequestFactory(
                HttpClientBuilder.create()
                        // DefaultRedirectStrategy only follows GET/HEAD redirects (and only when
                        // the response carries a Location header); the custom strategy also
                        // follows POST.
                        .setRedirectStrategy(new CustomRedirectStrategy())
                        .setKeepAliveStrategy(boundedKeepAliveStrategy())
                        .setDefaultRequestConfig(requestConfig)
                        .build()));
    }

    /**
     * Request factory wrapping the pooled {@link #httpClient()} bean.
     *
     * @return the factory used by {@link #defaultRestTemplate()}
     */
    @Bean
    public ClientHttpRequestFactory httpRequestFactory() {
        return new HttpComponentsClientHttpRequestFactory(httpClient());
    }

    /**
     * Pooled Apache HttpClient shared by {@link #defaultRestTemplate()}.
     *
     * @return an HttpClient with a 200-connection pool (50 per route) and 20-second
     *         connect / socket / pool-lease timeouts
     */
    @Bean
    public HttpClient httpClient() {
        Registry<ConnectionSocketFactory> registry = RegistryBuilder.<ConnectionSocketFactory>create()
                .register("http", PlainConnectionSocketFactory.getSocketFactory())
                .register("https", SSLConnectionSocketFactory.getSocketFactory())
                .build();
        PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager(registry);
        // Upper bound for the whole pool; tune for the workload.
        connectionManager.setMaxTotal(200);
        // Per-route limit, a subdivision of the total above.
        connectionManager.setDefaultMaxPerRoute(50);

        SocketConfig socketConfig = SocketConfig.custom()
                .setSoKeepAlive(false)
                .setSoLinger(1)
                .setSoReuseAddress(true)
                .setSoTimeout(120000)   // default socket read timeout (ms)
                .setTcpNoDelay(true)
                .build();
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectTimeout(20 * 1000)            // time allowed to establish the connection
                .setSocketTimeout(20 * 1000)             // time allowed between packets while reading
                .setConnectionRequestTimeout(20 * 1000)  // time allowed waiting for a pooled connection
                .build();

        return HttpClientBuilder.create()
                .setDefaultSocketConfig(socketConfig)
                // No default cookie store is set on purpose: a shared store would make every
                // pooled request carry the same cookies.
                .setConnectionManager(connectionManager).setConnectionManagerShared(true)
                // DefaultRedirectStrategy only follows GET/HEAD redirects (and only when the
                // response carries a Location header); the custom strategy also follows POST.
                .setRedirectStrategy(new CustomRedirectStrategy())
                .setKeepAliveStrategy(boundedKeepAliveStrategy())
                .setDefaultRequestConfig(requestConfig)
                .build();
    }

    /**
     * RestTemplate backed by OkHttp3.
     *
     * @param okHttpClient the OkHttp client bean to delegate to
     * @return a RestTemplate using an {@link OkHttp3ClientHttpRequestFactory}
     */
    @Bean
    public RestTemplate okHttpRestTemplate(OkHttpClient okHttpClient) {
        return new RestTemplate(new OkHttp3ClientHttpRequestFactory(okHttpClient));
    }

    /**
     * Keep-alive strategy that defers to {@link DefaultConnectionKeepAliveStrategy} but
     * replaces a result of -1 with {@link #FALLBACK_KEEP_ALIVE_MILLIS}. Per the original
     * author's note, -1 means "keep alive forever", which lets a server with a very long
     * (or absent) keep-alive hold client connections indefinitely.
     */
    private static ConnectionKeepAliveStrategy boundedKeepAliveStrategy() {
        return new DefaultConnectionKeepAliveStrategy() {
            @Override
            public long getKeepAliveDuration(final HttpResponse response, final HttpContext context) {
                long keepAlive = super.getKeepAliveDuration(response, context);
                return keepAlive == -1 ? FALLBACK_KEEP_ALIVE_MILLIS : keepAlive;
            }
        };
    }

}

下面的CookieJar用来解决子域名和一级域名共用cookie的问题,当时httpclient没找到解决方式,后来才找到的

/**
 * In-memory {@link CookieJar} that shares cookies between a registrable domain and all
 * of its sub-domains (e.g. {@code a.example.com} and {@code example.com}).
 *
 * <p>Cookies are grouped by {@link HttpUrl#topPrivateDomain()}; hosts without one
 * (IP addresses, single-label hosts such as {@code localhost}) are grouped by the raw
 * host. Access to the cache is serialised on this instance.
 */
public class OkHttpCookieManager implements CookieJar {

    /** Cookies keyed by the domain key computed in {@link #domainKey(HttpUrl)}. */
    Map<String,List<Cookie>> cacheCookie = new HashMap<>();

    /**
     * Supplies the cookies to attach to an outgoing request.
     *
     * @param url the request URL
     * @return a snapshot of the cookies stored for the URL's domain key; never null
     */
    @NotNull
    @Override
    public List<Cookie> loadForRequest(@NotNull HttpUrl url) {
        return putCookie(domainKey(url), null);
    }

    /**
     * Computes the cache key for a URL: the registrable domain when there is one,
     * otherwise the host itself.
     */
    private static String domainKey(HttpUrl url) {
        String registrableDomain = url.topPrivateDomain();
        return registrableDomain != null ? registrableDomain : url.host();
    }

    /**
     * Merges {@code cookieAppend} (if any) into the cookies stored under
     * {@code domainSp2} and returns the resulting cookies.
     *
     * @param domainSp2    the domain key
     * @param cookieAppend cookies to store, or null to only read
     * @return a copy of the stored cookies, so callers never iterate the live list
     */
    private synchronized List<Cookie> putCookie(String domainSp2, List<Cookie> cookieAppend) {
        List<Cookie> stored = cacheCookie.get(domainSp2);
        if (cookieAppend != null) {
            if (stored == null) {
                stored = new ArrayList<>();
                cacheCookie.put(domainSp2, stored);
            }
            for (Cookie incoming : cookieAppend) {
                // A cookie with the same name, domain and path supersedes the stored one;
                // appending blindly would grow the list forever and send duplicates.
                stored.removeIf(existing -> existing.name().equals(incoming.name())
                        && existing.domain().equals(incoming.domain())
                        && existing.path().equals(incoming.path()));
                stored.add(incoming);
            }
        }
        return stored == null ? new ArrayList<>() : new ArrayList<>(stored);
    }

    /**
     * Stores the cookies received in a response.
     *
     * @param url     the URL the response came from
     * @param cookies the cookies parsed from the response
     */
    @Override
    public void saveFromResponse(@NotNull HttpUrl url, @NotNull List<Cookie> cookies) {
        putCookie(domainKey(url), cookies);
    }
}
/**
 * Spring configuration for the shared {@link OkHttpClient}.
 *
 * <p>SECURITY: this configuration trusts every TLS certificate and every hostname.
 * That is a deliberate choice for crawling arbitrary sites, but it removes all
 * protection against man-in-the-middle attacks; do not reuse it for traffic that
 * carries credentials or other sensitive data.
 */
@Configuration
public class OkHttpConfig {

    /**
     * Trust manager whose client and server checks are no-ops, i.e. every certificate
     * chain is accepted.
     *
     * @return the trust-all manager
     */
    @Bean
    public X509TrustManager x509TrustManager() {
        return new X509TrustManager() {
            @Override
            public void checkClientTrusted(X509Certificate[] x509Certificates, String s) {
                // Intentionally empty: accept any client certificate.
            }
            @Override
            public void checkServerTrusted(X509Certificate[] x509Certificates, String s) {
                // Intentionally empty: accept any server certificate.
            }
            @Override
            public X509Certificate[] getAcceptedIssuers() {
                return new X509Certificate[]{};
            }
        };
    }

    /**
     * Socket factory built from a TLS context initialised with the trust-all manager.
     *
     * @return the socket factory; never null
     * @throws IllegalStateException if the TLS context cannot be created or initialised
     */
    @Bean
    public SSLSocketFactory sslSocketFactory() {
        try {
            // Trust any connection.
            SSLContext sslContext = SSLContext.getInstance("TLS");
            sslContext.init(null, new TrustManager[]{x509TrustManager()}, new SecureRandom());
            return sslContext.getSocketFactory();
        } catch (NoSuchAlgorithmException | KeyManagementException e) {
            // Fail fast with the cause attached instead of returning null, which would
            // only surface later as an unrelated failure when the client is built.
            throw new IllegalStateException("Failed to initialise the TLS context for OkHttp", e);
        }
    }

    /**
     * Connection pool for the shared client, constructed with the arguments
     * (30, 1, MINUTES); see the OkHttp {@code ConnectionPool} constructor for their
     * exact meaning.
     *
     * @return the connection pool
     */
    @Bean
    public ConnectionPool pool() {
        return new ConnectionPool(30, 1, TimeUnit.MINUTES);
    }

    /**
     * The shared OkHttp client: trust-all TLS, pooled connections, 30-second connect and
     * read timeouts, redirects followed (including across http/https), and cookies
     * handled by {@link OkHttpCookieManager}.
     *
     * @return the configured client
     */
    @Bean
    public OkHttpClient okHttpClient() {
        return new OkHttpClient.Builder()
                // Accept any hostname (see the class-level security note).
                .hostnameVerifier((hostname, session) -> true)
                .sslSocketFactory(sslSocketFactory(), x509TrustManager())
                // retryOnConnectionFailure is left at the library default.
                .connectionPool(pool())
                .connectTimeout(30L, TimeUnit.SECONDS)
                .readTimeout(30L, TimeUnit.SECONDS)
                .followRedirects(true).followSslRedirects(true)
                .cookieJar(new OkHttpCookieManager())
                .build();
    }
}

以上配置产生了3套RestTemplate,分别基于:OkHttp连接池、HttpClient连接池、不使用共享连接池的HttpClient

场景问题

为什么要这么写?那就衍生出以下场景问题(搜遍全网的资料太少)

1.HttpClient建立连接池后会复用连接,且CookieStore也会被重用,即访问x.com/s和访问x.com/m时携带的request cookie会一致,造成登录串号问题

解决方式:

  • 不使用连接池,保证每个连接都使用新建的client(new HttpClient或HttpClientBuilder.create)
  • setDefaultCookieStore(Constants.httpCookieStore) 可以通过设置Cookie存储然后在具体操作时对Cookie进行筛选,这样client拿到的就不是原来的cookieStore了
  • RequestConfig requestConfig = RequestConfig.custom() .setCookieSpec(CookieSpecs.IGNORE_COOKIES) .build();使用cookie策略忽略Cookie,但这时都需要外部在header中进行cookie追加处理

2.请求后服务端进行302重定向,无法获取目标数据

解决方案:基本上很多帖子都讲到了,在client上配置重定向策略,让client自动跟随服务端返回的重定向
setRedirectStrategy(new LaxRedirectStrategy())
此处的重定向策略可以自定义,现成可用的实现有两种:

com.gargoylesoftware.htmlunit.httpclient.HtmlUnitRedirectStrategie
org.apache.http.impl.client.LaxRedirectStrategy

两者判断是否满足重定向条件的方式不同:

  1. HtmlUnit增加location判断、同时调用默认的DefaultRedirectStrategy.isRedirected进行状态判断
  2. LaxRedirectStrategy 在沿用父类响应状态判断的基础上,增加了对HttpRequest请求方法(Method)的判断
HtmlUnit:
return super.isRedirected(request, response, context) && response.getFirstHeader("location") != null;
/**
 * Redirect strategy that, on top of what {@link DefaultRedirectStrategy} does, treats
 * GET, POST and HEAD requests as redirectable.
 */
public class LaxRedirectStrategy extends DefaultRedirectStrategy {
    /** HTTP methods for which a redirect may be followed automatically. */
    private static final String[] REDIRECT_METHODS = new String[]{"GET", "POST", "HEAD"};

    /** Creates the strategy; there is no state to initialise. */
    public LaxRedirectStrategy() {
    }

    /**
     * Tells whether requests using the given method may be redirected.
     *
     * @param method the HTTP method name, compared case-insensitively
     * @return true if the method is one of GET, POST or HEAD
     */
    protected boolean isRedirectable(String method) {
        for (String candidate : REDIRECT_METHODS) {
            if (candidate.equalsIgnoreCase(method)) {
                return true;
            }
        }
        return false;
    }
}

3.经过服务器端跳转后会set-Cookie,如何获取?此时外部API无法获取,直接得到的是目标对象response

解决方式:
重写RedirectStrategy跳转策略,即setRedirectStrategy(new CustomRedirectStrategy())。我这里采用了和LaxRedirectStrategy同样的方式来判断请求方法是否可以重定向,并对getLocationURI进行了重写,因为要获取302响应的response header中的location用来截取参数,同时获取响应cookie

注意:如果在多线程多连接的情况下,重写需要注意区分cookie,不然cookie会串,可通过ThreadLocal、request、response取值做标识区分;

/**
 * Redirect strategy that follows GET, POST and HEAD redirects and, for redirects whose
 * Location contains the marker {@code "xxx"}, records the Location value and the
 * response's Set-Cookie headers in the shared maps on {@code Constants} so callers can
 * read them after the redirect has been followed.
 *
 * <p>Entries are keyed by the value of the request's {@code Upgrade-Insecure-Request}
 * header; callers running concurrently must send distinct values or their captured
 * data will overwrite each other.
 */
@Immutable
public class CustomRedirectStrategy extends DefaultRedirectStrategy {
    /** HTTP methods for which a redirect may be followed automatically. */
    private static final String[] REDIRECT_METHODS = new String[]{"GET", "POST", "HEAD"};

    /** Size above which a capture map is emptied before the next insert. */
    private static final int MAX_CACHED_ENTRIES = 1000;

    /**
     * Tells whether requests using the given method may be redirected.
     *
     * @param method the HTTP method name, compared case-insensitively
     * @return true if the method is one of GET, POST or HEAD
     */
    @Override
    protected boolean isRedirectable(String method) {
        for (String m : REDIRECT_METHODS) {
            if (m.equalsIgnoreCase(method)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Captures the redirect target and response cookies (when applicable), then
     * delegates to the default implementation to resolve the redirect URI.
     *
     * @param request  the request that triggered the redirect
     * @param response the redirect response
     * @param context  the execution context
     * @return the URI resolved by {@link DefaultRedirectStrategy}
     * @throws ProtocolException if the default implementation rejects the redirect
     */
    @Override
    public URI getLocationURI(HttpRequest request, HttpResponse response, HttpContext context) throws ProtocolException {
        Header location = response.getFirstHeader("location");
        if (location != null && location.getValue().contains("xxx")) {
            Header keyHeader = request.getFirstHeader("Upgrade-Insecure-Request");
            // Without the key header there is nothing to file the capture under; skip the
            // capture rather than fail the redirect with a NullPointerException.
            if (keyHeader != null) {
                String key = keyHeader.getValue();

                List<String> cookieList = new ArrayList<>();
                for (Header header : response.getHeaders(HttpHeaders.SET_COOKIE)) {
                    // NOTE(review): this yields "<header name>=<header value>", i.e. the
                    // Set-Cookie header's own name is the prefix — confirm consumers expect that.
                    cookieList.add(header.getName() + "=" + header.getValue());
                }

                // Crude bound on memory: empty each map once it has grown past the limit.
                if (Constants.xx.size() > MAX_CACHED_ENTRIES) {
                    Constants.xx.clear();
                }
                if (Constants.xxx.size() > MAX_CACHED_ENTRIES) {
                    Constants.xxx.clear();
                }

                Constants.xxx.put(key, location.getValue());
                Constants.xx.put(key, cookieList);
            }
        }

        return super.getLocationURI(request, response, context);
    }
}

4.整合OkHttp遇到的https访问问题

//此处通过信任所有主机处理
OkHttpClient.Builder().hostnameVerifier((hostname, session) -> true)

5.使用体验
如果是底层业务系统开发,可采用kotlin语言的okHttp高效、简单(但API不友好),对比之下RestTemplate使用上手友好的多,不过暴露的可操作入口也是极少