访问页面:

private static void viewPages() throws Exception{

 

        Thread.sleep(500);

 

        new Thread(new Runnable() {

 

            @Override

            public void run() {

                try {

 

                    while(!urlWaitingQueue.isEmpty()) {

 

                        String url = urlWaitingQueue.peek();

 

                        final String finalUrl = url;

 

                        // build a client, like open a browser

                        CloseableHttpClient httpClient = HttpClients.createDefault();

 

                        // create get method, like input url in the browser

                        //HttpGet httpGet = new HttpGet("http://www.dxy.cn");

                        HttpPost httpPost = new HttpPost(finalUrl);

 

                        StringBuffer stringBuffer = new StringBuffer();

                        HttpResponse response;

 

 

                        //List<NameValuePair> keyValue = new ArrayList<NameValuePair>();

 

                        //  Post parameter

                        //            keyValue.add(new BasicNameValuePair("username", "zhu"));

                        //

                        //            httpPost.setEntity(new UrlEncodedFormEntity(keyValue, "UTF-8"));

 

 

                        // access and get response

                        response = httpClient.execute(httpPost);

 

                        // record access URL

                        urlQueue.putIfAbsent(finalUrl, Boolean.TRUE);

 

                        if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {

 

                            HttpEntity httpEntity = response.getEntity();

                            if (httpEntity != null) {

                                logger.info("viewPages访问URL:" + finalUrl);

                                BufferedReader reader = new BufferedReader(

                                        new InputStreamReader(httpEntity.getContent(), "UTF-8"));

 

                                String line = null;

                                if (httpEntity.getContentLength() > 0) {

 

                                    stringBuffer = new StringBuffer((int) httpEntity.getContentLength());

 

                                    while ((line = reader.readLine()) != null) {

                                        stringBuffer.append(line);

                                    }

 

                                    System.out.println(finalUrl + "内容: " + stringBuffer);

                                }

                            }

 

                        }

 

                    }

 

 

                } catch (Exception e) {

                    logger.error("view pages error", e);

                }

            }

 

        }).start();

 

 

    }

三. 总结及将来要实现功能

 

以上贴出了简易版Java爬虫的核心实现模块, 基本上拿起来就能测试。

控制爬取速度(调度模块)、使用代理IP访问(收集网络代理模块)等功能目前尚未实现, 你可以在自己的版本中慢慢加上...