
/**
 * Builds a worker {@link Thread} whose {@code run()} walks {@code urlWaitingQueue}:
 * for each URL it issues an HTTP POST with a default {@code CloseableHttpClient},
 * records the URL in {@code urlQueue}, and, on a 200 response with a non-null
 * entity, opens a UTF-8 reader over the response body and prints the collected text.
 *
 * <p>NOTE(review): this listing is truncated. The body of the {@code readLine} loop
 * and all closing braces (including whatever follows {@code new Thread(...)}, e.g. a
 * {@code start()} call) are not visible here, so the comments below describe only
 * what the visible lines show.
 *
 * @throws Exception declared by the signature; exceptions raised inside
 *                   {@code run()} are caught and logged by the visible catch block
 */
private static void viewPages() throws Exception{




        new Thread(new Runnable() {



            public void run() {

                try {

                    // Loop for as long as there are URLs waiting to be visited.
                    while(!urlWaitingQueue.isEmpty()) {

                        // NOTE(review): peek() reads the head without removing it; unless the
                        // missing part of this listing dequeues the URL, the loop never advances - confirm.
                        String url = urlWaitingQueue.peek();


                        final String finalUrl = url;


                        // build a client, like open a browser
                        // NOTE(review): a new client is created on every iteration and no close()
                        // call is visible in this excerpt - confirm it is released in the missing part.
                        CloseableHttpClient httpClient = HttpClients.createDefault();


                        // create get method, like input url in the browser

                        //HttpGet httpGet = new HttpGet("http://www.dxy.cn");
                        // The request actually sent is a POST (the GET variant above is commented out).
                        HttpPost httpPost = new HttpPost(finalUrl);

                        // Accumulates the response text; replaced below by a presized buffer.
                        StringBuffer stringBuffer = new StringBuffer();

                        HttpResponse response;



                        //List<NameValuePair> keyValue = new ArrayList<NameValuePair>();


                        //  Post parameter

                        //            keyValue.add(new BasicNameValuePair("username", "zhu"));


                        //            httpPost.setEntity(new UrlEncodedFormEntity(keyValue, "UTF-8"));



                        // access and get response

                        response = httpClient.execute(httpPost);


                        // record access URL
                        // Recorded right after execute() returns, before the status code is checked,
                        // so non-200 responses are recorded as well.
                        urlQueue.putIfAbsent(finalUrl, Boolean.TRUE);

                        // Only 200 OK responses have their body read.
                        if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {


                            HttpEntity httpEntity = response.getEntity();

                            if (httpEntity != null) {

                                logger.info("viewPages访问URL:" + finalUrl);
                                // Decode the response body as UTF-8 text, read line by line.
                                BufferedReader reader = new BufferedReader(

                                        new InputStreamReader(httpEntity.getContent(), "UTF-8"));


                                String line = null;
                                // The body is only read when a positive content length is reported.
                                // NOTE(review): presumably this skips responses of unknown length - confirm.
                                if (httpEntity.getContentLength() > 0) {

                                    // Presize the buffer to the reported length (narrowed from long to int).
                                    stringBuffer = new StringBuffer((int) httpEntity.getContentLength());

                                    // NOTE(review): the loop body is missing from this listing;
                                    // presumably each line is appended to stringBuffer - confirm.
                                    while ((line = reader.readLine()) != null) {




                                    System.out.println(finalUrl + "内容: " + stringBuffer);









                } catch (Exception e) {
                    // Any failure inside run() is logged together with its stack trace.
                    logger.error("view pages error", e);








三. 总结及将来要实现功能


以上贴出了简易版Java爬虫的核心实现模块, 基本上拿起来就能测试。

控制爬取速度(调度模块)、使用代理IP访问(收集网络代理模块)等功能的实现, 你可以在自己的版本中慢慢加上...