HttpClient可以通过模拟请求获取响应的方式实现数据抓取,相较于HtmlUnit模拟页面点击的抓取方式,这种通过模拟请求的方式更加高效,可以有针对性地实现一个请求来获取响应数据,避免了页面中大量不必要的请求和响应。但同时,这个特点也使得它不能执行嵌入在HTML页面中的JavaScript代码、猜测内容类型、重新格式化请求/重定向URI,或者其它和HTTP传输无关的功能,只是单纯地获得响应报文。
//[1]声明HttpClient CloseableHttpClient client = HttpClients.createDefault(); //[2]增加post访问地址,如果是get请求,使用HttpGet String url=""; HttpPost httpPost=new HttpPost(url); //[3]普通传参 List<NameValuePair> param = new ArrayList(); param.add(new BasicNameValuePair("key", "value")); //[4]请求参数拼接 httpPost.setEntity(new UrlEncodedFormEntity(param, "GBK")); //[3]组装json传参数 JSONObject jsonParam = new JSONObject(); jsonParam.put("key", value); //[4]请求响应实体类,设置编码并加入json StringEntity entity = new StringEntity(jsonParam.toString(),"utf-8"); entity.setContentType("application/json"); httpPost.setEntity(entity); //[5]设置请求头信息 httpPost.setHeader("POST", ""); //使用代理 HttpHost proxy = new HttpHost(ip,port,"http"); RequestConfig config = RequestConfig.custom().setProxy(proxy).setConnectionRequestTimeout(2000).build(); httpPost.setConfig(config); //[6]发送请求并获取响应 CloseableHttpResponse response = client.execute(httpPost); //[7]获取响应实体 HttpEntity httpEntity=response.getEntity(); //[8]解析响应实体中的数据为字符串 String result = EntityUtils.toString(httpEntity, "gb2312"); //解析json串 Document document=Jsoup.parseBodyFragment(result.toString); JSONObject jsonObject=new JSONObject(document.text()); JSONArray jsonArray=new JSONArray(jsonObject.get("datas")); for(int i=0;i<jsonArray.length();i++){ JSONObject jsonObject1=jsonArray.getJSONObject(i); } //[9]销毁实体 EntityUtils.consume(entity); EntityUtils.consume(httpEntity); //[10]释放连接 httpPost.releaseConnection(); client.close(); response.close();
使用代理服务器时,如果需要判断代理服务器是否可用,可以使用telnet进行连通性测试,Java实现如下:
/** Telnet client used for the connectivity probe; (re)created on every call to {@link #test()}. */
private TelnetClient telnet;
/** Host name or IP address of the server to probe. */
private String serverIP;
/** TCP port of the server to probe. */
private int serverPort;
/** Terminal type passed to the telnet client; may be null or blank. */
private String termtype;

/**
 * Choosing an appropriate terminal type can resolve some garbled-character problems.
 */
public static String WINDOWS = "VT220";
public static String UNIX = "VT100";
public static String LINUX = "VT100";

/**
 * Creates a probe for the given server.
 *
 * @param server     host name or IP address to connect to
 * @param termtype   terminal type (e.g. {@link #WINDOWS}); when null or blank the
 *                   telnet client's default terminal type is used
 * @param serverPort TCP port to connect to
 */
public TelnetTest(String server, String termtype, int serverPort) {
    this.serverIP = server;
    this.termtype = termtype;
    this.serverPort = serverPort;
}

/**
 * Checks whether a telnet connection to the configured server can be established.
 *
 * @return {@code true} if the connection succeeded, {@code false} if any exception
 *         occurred while connecting (the exception is printed to stderr)
 */
public boolean test() {
    boolean flag = false;
    try {
        if (termtype != null && !termtype.trim().equals("")) {
            telnet = new TelnetClient(termtype);
        } else {
            // Without this fallback `telnet` stayed null for a null/blank terminal
            // type, and the disconnect in `finally` threw an uncaught NPE.
            telnet = new TelnetClient();
        }
        telnet.setConnectTimeout(6 * 1000); // connect timeout: 6 seconds
        telnet.connect(serverIP, serverPort);
        telnet.setSoTimeout(4 * 1000);      // blocking read timeout: 4 seconds
        flag = true;
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Guard against the client never having been created (constructor failure).
        if (telnet != null) {
            try {
                telnet.disconnect();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return flag;
}
相关博文:使用HtmlUnit实现数据抓取 http://zuohao1990.blog.51cto.com/6057850/1726548