1: What is a web crawler? A crawler is a program or script that automatically fetches information from the World Wide Web according to a set of rules.

2: What basic knowledge do you need to write a Java crawler?

jdbc: reading from and writing to the database.

ehcache (or redis): duplicate-URL detection.

log4j: logging.

httpclient: sending HTTP requests.

jsoup: parsing the returned page content. (A minimal sketch of how httpclient and jsoup fit together is shown right below this list.)
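Before walking through the full project, here is a minimal sketch of that fetch-and-parse flow, kept separate from the project code that follows. The class name MiniSpider and the "a" selector are illustrative only; the real crawler below uses a much more specific selector.

// Minimal sketch: fetch a page with httpclient, then parse it with jsoup (illustrative only).
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class MiniSpider {
    public static void main(String[] args) throws Exception {
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpGet get = new HttpGet("http://www.cnblogs.com/");
        try (CloseableHttpResponse response = httpClient.execute(get)) {
            String html = EntityUtils.toString(response.getEntity(), "utf-8"); // raw HTML
            Document doc = Jsoup.parse(html);                                  // parse with jsoup
            for (Element link : doc.select("a")) {                             // every anchor on the page
                System.out.println(link.attr("href"));
            }
        } finally {
            httpClient.close();
        }
    }
}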

3: A worked example: crawling the cnblogs.com homepage. Address: 博客园 - 代码改变世界 (http://www.cnblogs.com/).

Project structure

pom.xml: the project's Maven dependencies

<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.inossem</groupId>
  <artifactId>BlogSpider</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <dependencies>
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.37</version>
    </dependency>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.2</version>
    </dependency>
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.10.1</version>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.16</version>
    </dependency>
    <dependency>
      <groupId>net.sf.ehcache</groupId>
      <artifactId>ehcache</artifactId>
      <version>2.10.3</version>
    </dependency>
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>2.5</version>
    </dependency>
  </dependencies>
</project>
log4j.properties: logging configuration
log4j.rootLogger=INFO, stdout,D
#Console
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target = System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=[%-5p] %d{yyyy-MM-dd HH:mm:ss,SSS} method:%l%n%m%n
#D
log4j.appender.D = org.apache.log4j.RollingFileAppender
log4j.appender.D.File = D://bloglogs/log.log
log4j.appender.D.MaxFileSize=100KB
log4j.appender.D.MaxBackupIndex=100
log4j.appender.D.Append = true
log4j.appender.D.layout = org.apache.log4j.PatternLayout
log4j.appender.D.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss} [ %t:%r ] - [ %p ] %m%n
ehcache.xml: cache configuration
<?xml version="1.0" encoding="UTF-8"?>
<ehcache>
  <defaultCache
    maxElementsInMemory="1"
    eternal="true"
    overflowToDisk="true"/>
  <cache
    name="cnblog"
    maxElementsInMemory="1"
    diskPersistent="true"
    eternal="true"
    overflowToDisk="true"/>
</ehcache>
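The cache named "cnblog" defined above is what gives the crawler its duplicate-URL detection: every successfully stored link is put into the cache, and links already present are skipped on later runs. A stripped-down sketch of that idea, assuming the file above is saved as D:/ehcache/ehcache.xml (the cacheFilePath configured below); the class name UrlDedupSketch and the sample URL are made up for illustration:

// Sketch of duplicate-URL detection with ehcache, simplified from CnBlogSpider below.
import net.sf.ehcache.Cache;
import net.sf.ehcache.CacheManager;
import net.sf.ehcache.Element;

public class UrlDedupSketch {
    public static void main(String[] args) {
        CacheManager manager = CacheManager.create("D:/ehcache/ehcache.xml"); // load the config above
        Cache cache = manager.getCache("cnblog");
        String url = "http://www.cnblogs.com/some-post/";
        if (cache.get(url) == null) {        // not seen before: crawl it and remember it
            cache.put(new Element(url, url));
            System.out.println("crawl " + url);
        } else {                             // already crawled: skip
            System.out.println("skip " + url);
        }
        cache.flush();                       // persist to disk so the next run remembers it
        manager.shutdown();
    }
}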
spider.properties: crawler configuration (database connection, cache file and image paths)
dbUrl=jdbc:mysql://localhost:3306/db_blogs?autoReconnect=true
dbUserName=root
dbPassword=root
jdbcName=com.mysql.jdbc.Driver
cacheFilePath=D://ehcache//ehcache.xml
imageFilePath=D://blogImages/
imagePath=http://localhost:8080/BlogCms/static/blogmIages/
DateUtil.java: date utility class
package com.inossem.blog.util;

import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Date utility class
 * @author user
 */
public class DateUtil {

    /**
     * Returns the current date as a yyyy/MM/dd path segment
     * @return
     * @throws Exception
     */
    public static String getCurrentDatePath() throws Exception {
        Date date = new Date();
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy/MM/dd");
        return sdf.format(date);
    }

    public static void main(String[] args) {
        try {
            System.out.println(getCurrentDatePath());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
DbUtil.java: database utility class
package com.inossem.blog.util;

import java.sql.Connection;
import java.sql.DriverManager;

/**
 * Database utility class
 * @author user
 */
public class DbUtil {

    /**
     * Opens a connection
     * @return
     * @throws Exception
     */
    public Connection getCon() throws Exception {
        Class.forName(PropertiesUtil.getValue("jdbcName"));
        Connection con = DriverManager.getConnection(PropertiesUtil.getValue("dbUrl"),
                PropertiesUtil.getValue("dbUserName"), PropertiesUtil.getValue("dbPassword"));
        return con;
    }

    /**
     * Closes a connection
     * @param con
     * @throws Exception
     */
    public void closeCon(Connection con) throws Exception {
        if (con != null) {
            con.close();
        }
    }

    public static void main(String[] args) {
        DbUtil dbUtil = new DbUtil();
        try {
            dbUtil.getCon();
            System.out.println("Database connection succeeded");
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("Database connection failed");
        }
    }
}
PropertiesUtil.java: utility class for reading the configuration file
package com.inossem.blog.util;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

/**
 * Properties utility class
 * @author user
 */
public class PropertiesUtil {

    /**
     * Returns the value for the given key
     * @param key
     * @return
     */
    public static String getValue(String key) {
        Properties prop = new Properties();
        try {
            InputStream in = new FileInputStream("src/main/resources/spider.properties");
            prop.load(in);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return prop.getProperty(key);
    }

    public static void main(String[] args) {
        System.out.println(getValue("imageFilePath"));
    }
}
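One caveat: getValue resolves "src/main/resources/spider.properties" relative to the working directory, so it only works when the program is started from the project root. A possible variant, not part of the original project (the class name ClasspathPropertiesUtil is made up), loads the file from the classpath instead, so it also works once the project is packaged as a jar:

package com.inossem.blog.util;

// Hypothetical classpath-based variant of PropertiesUtil (illustrative sketch only).
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

public class ClasspathPropertiesUtil {

    public static String getValue(String key) {
        Properties prop = new Properties();
        try (InputStream in = ClasspathPropertiesUtil.class.getClassLoader()
                .getResourceAsStream("spider.properties")) {   // resolved on the classpath, not the file system
            if (in != null) {
                prop.load(in);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return prop.getProperty(key);
    }
}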
CnBlogSpider.java: the crawler
package com.inossem.blog.spider;

import com.inossem.blog.util.DateUtil;
import com.inossem.blog.util.DbUtil;
import com.inossem.blog.util.PropertiesUtil;
import net.sf.ehcache.Cache;
import net.sf.ehcache.CacheManager;
import net.sf.ehcache.Status;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;

public class CnBlogSpider {

    private static Logger logger = Logger.getLogger(CnBlogSpider.class);
    private static final String URL = "http://www.cnblogs.com/";
    private static Connection con = null;
    private static CacheManager manager = null; // cache manager
    private static Cache cache = null; // cache instance

    /**
     * Parses the homepage
     */
    private static void parseHomePage() {
        logger.info("Start crawling " + URL);
        manager = CacheManager.create(PropertiesUtil.getValue("cacheFilePath"));
        cache = manager.getCache("cnblog");
        CloseableHttpClient httpClient = HttpClients.createDefault(); // get an HttpClient instance
        HttpGet httpget = new HttpGet(URL); // create an HttpGet instance
        RequestConfig config = RequestConfig.custom().setSocketTimeout(100000) // socket read timeout
                .setConnectTimeout(5000) // connection timeout
                .build();
        httpget.setConfig(config);
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpget);
        } catch (ClientProtocolException e) {
            logger.error(URL + "-ClientProtocolException", e);
        } catch (IOException e) {
            logger.error(URL + "-IOException", e);
        }
        if (response != null) {
            HttpEntity entity = response.getEntity(); // get the response entity
            // only proceed when the status code is 200
            if (response.getStatusLine().getStatusCode() == 200) {
                String webPageContent = null;
                try {
                    webPageContent = EntityUtils.toString(entity, "utf-8");
                    parseHomeWebPage(webPageContent);
                } catch (ParseException e) {
                    logger.error(URL + "-ParseException", e);
                } catch (IOException e) {
                    logger.error(URL + "-IOException", e);
                }
            } else {
                logger.error(URL + "-status code was not 200");
            }
        } else {
            logger.error(URL + "-connection timed out");
        }
        try {
            if (response != null) {
                response.close();
            }
            if (httpClient != null) {
                httpClient.close();
            }
        } catch (Exception e) {
            logger.error(URL + "-Exception", e);
        }
        if (cache.getStatus() == Status.STATUS_ALIVE) {
            cache.flush(); // write the cache to disk
        }
        manager.shutdown();
        logger.info("Finished crawling " + URL);
    }

    /**
     * Parses the homepage content and extracts the blog post links
     *
     * @param webPageContent
     */
    private static void parseHomeWebPage(String webPageContent) {
        if ("".equals(webPageContent)) {
            return;
        }
        Document doc = Jsoup.parse(webPageContent);
        Elements links = doc.select("#post_list .post_item .post_item_body h3 a");
        for (int i = 0; i < links.size(); i++) {
            Element link = links.get(i);
            String url = link.attr("href");
            System.out.println(url);
            if (cache.get(url) != null) { // skip URLs that are already in the cache
                logger.info(url + "-already in cache");
                continue;
            }
            parseBlogLink(url);
        }
    }

    /**
     * Fetches a blog post from its link
     *
     * @param link
     */
    private static void parseBlogLink(String link) {
        logger.info("Start crawling " + link);
        CloseableHttpClient httpClient = HttpClients.createDefault(); // get an HttpClient instance
        HttpGet httpget = new HttpGet(link); // create an HttpGet instance
        RequestConfig config = RequestConfig.custom().setSocketTimeout(100000) // socket read timeout
                .setConnectTimeout(5000) // connection timeout
                .build();
        httpget.setConfig(config);
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpget);
        } catch (ClientProtocolException e) {
            logger.error(link + "-ClientProtocolException", e);
        } catch (IOException e) {
            logger.error(link + "-IOException", e);
        }
        if (response != null) {
            HttpEntity entity = response.getEntity(); // get the response entity
            // only proceed when the status code is 200
            if (response.getStatusLine().getStatusCode() == 200) {
                String blogContent = null;
                try {
                    blogContent = EntityUtils.toString(entity, "utf-8");
                    parseBlogPage(blogContent, link);
                } catch (ParseException e) {
                    logger.error(link + "-ParseException", e);
                } catch (IOException e) {
                    logger.error(link + "-IOException", e);
                }
            } else {
                logger.error(link + "-status code was not 200");
            }
        } else {
            logger.error(link + "-connection timed out");
        }
        try {
            if (response != null) {
                response.close();
            }
            if (httpClient != null) {
                httpClient.close();
            }
        } catch (Exception e) {
            logger.error(link + "-Exception", e);
        }
        logger.info("Finished crawling " + link);
    }

    /**
     * Parses the blog post content and extracts the useful fields
     *
     * @param blogContent
     * @param link
     */
    private static void parseBlogPage(String blogContent, String link) {
        if ("".equals(blogContent)) {
            return;
        }
        Document doc = Jsoup.parse(blogContent);
        Elements titleElements = doc.select("#cb_post_title_url"); // blog post title
        if (titleElements.size() == 0) {
            logger.error(link + "-no blog title found");
            return;
        }
        String title = titleElements.get(0).text();
        System.out.println("Blog title: " + title);
        Elements contentElements = doc.select("#cnblogs_post_body"); // blog post body
        if (contentElements.size() == 0) {
            logger.error(link + "-no blog body found");
            return;
        }
        String content = contentElements.get(0).html();
        System.out.println("Blog content: " + content);
        /*
         * Optional image handling: download the images and rewrite their URLs in the content
         */
        // Elements imgElements = contentElements.select("img"); // all image elements
        // List<String> imgUrlList = new LinkedList<String>();
        // for (int i = 0; i < imgElements.size(); i++) {
        //     Element imgEle = imgElements.get(i);
        //     String url = imgEle.attr("src");
        //     imgUrlList.add(url);
        //     System.out.println(url);
        // }
        //
        // if (imgUrlList.size() > 0) {
        //     Map<String, String> replaceImgMap = downLoadImages(imgUrlList);
        //     String newContent = replaceWebPageImages(content, replaceImgMap);
        //     content = newContent;
        // }
        // insert into the database
        String sql = "insert into t_article values(null,?,?,?,now())";
        try {
            PreparedStatement pstmt = con.prepareStatement(sql);
            pstmt.setString(1, title);
            pstmt.setString(2, content);
            pstmt.setString(3, link);
            if (pstmt.executeUpdate() == 1) {
                logger.info(link + "-inserted into the database");
                cache.put(new net.sf.ehcache.Element(link, link));
                logger.info(link + "-added to the cache");
            } else {
                logger.info(link + "-insert into the database failed");
            }
        } catch (SQLException e) {
            logger.error("SQLException", e);
        }
    }

    /**
     * Replaces the original image URLs in the page content with the new local ones
     *
     * @param content
     * @param replaceImgMap
     * @return
     */
    private static String replaceWebPageImages(String content, Map<String, String> replaceImgMap) {
        for (String url : replaceImgMap.keySet()) {
            String newPath = replaceImgMap.get(url);
            content = content.replace(url, newPath);
        }
        return content;
    }

    /**
     * Downloads the images to the local disk
     *
     * @param imgUrlList
     * @return
     */
    private static Map<String, String> downLoadImages(List<String> imgUrlList) {
        Map<String, String> replaceImgMap = new HashMap<String, String>();
        RequestConfig config = RequestConfig.custom().setSocketTimeout(10000) // socket read timeout
                .setConnectTimeout(5000) // connection timeout
                .build();
        CloseableHttpClient httpClient = HttpClients.createDefault(); // get an HttpClient instance
        for (int i = 0; i < imgUrlList.size(); i++) {
            try {
                Thread.sleep(1000); // be polite: wait one second between downloads
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            String url = imgUrlList.get(i);
            logger.info("Start downloading image " + url);
            CloseableHttpResponse response = null;
            try {
                HttpGet httpget = new HttpGet(url); // create an HttpGet instance
                httpget.setConfig(config);
                response = httpClient.execute(httpget);
            } catch (ClientProtocolException e) {
                logger.error(url + "-ClientProtocolException");
            } catch (IOException e) {
                logger.error(url + "-IOException");
            }
            if (response != null) {
                HttpEntity entity = response.getEntity(); // get the response entity
                // only proceed when the status code is 200
                if (response.getStatusLine().getStatusCode() == 200) {
                    try {
                        InputStream inputStream = entity.getContent();
                        String imageType = entity.getContentType().getValue();
                        String urlB = imageType.split("/")[1]; // file suffix taken from the content type
                        String uuid = UUID.randomUUID().toString();
                        String currentDatePath = DateUtil.getCurrentDatePath();
                        String newPath = PropertiesUtil.getValue("imagePath") + currentDatePath + "/" + uuid + "." + urlB;
                        FileUtils.copyToFile(inputStream, new File(PropertiesUtil.getValue("imageFilePath") + currentDatePath + "/" + uuid + "." + urlB));
                        replaceImgMap.put(url, newPath);
                    } catch (UnsupportedOperationException e) {
                        e.printStackTrace();
                    } catch (IOException e) {
                        e.printStackTrace();
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                } else {
                    logger.error("status code was not 200");
                }
            } else {
                logger.error("connection timed out");
            }
            try {
                if (response != null) {
                    response.close();
                }
            } catch (Exception e) {
                logger.error("Exception", e);
            }
            logger.info("Finished downloading image " + url);
        }
        return replaceImgMap;
    }

    public static void start() {
        DbUtil dbUtil = new DbUtil();
        try {
            con = dbUtil.getCon();
        } catch (Exception e) {
            logger.error("Failed to create the database connection", e);
        }
        parseHomePage();
    }

    public static void main(String[] args) {
        start();
    }
}
DownLoadImageTest.java: image download test class
package com.inossem.blog.spider;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.UUID;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.log4j.Logger;

import com.inossem.blog.util.DateUtil;
import com.inossem.blog.util.PropertiesUtil;

public class DownLoadImageTest {

    private static Logger logger = Logger.getLogger(DownLoadImageTest.class);
    private static final String link = "http://images2015.cnblogs.com/blog/952033/201705/952033-20170511210141910-342481715.png";

    public static void main(String[] args) {
        logger.info("Start downloading image " + link);
        CloseableHttpClient httpClient = HttpClients.createDefault(); // get an HttpClient instance
        HttpGet httpget = new HttpGet(link); // create an HttpGet instance
        RequestConfig config = RequestConfig.custom().setSocketTimeout(10000) // socket read timeout
                .setConnectTimeout(5000) // connection timeout
                .build();
        httpget.setConfig(config);
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpget);
        } catch (ClientProtocolException e) {
            logger.error("ClientProtocolException", e);
        } catch (IOException e) {
            logger.error("IOException", e);
        }
        if (response != null) {
            HttpEntity entity = response.getEntity(); // get the response entity
            // only proceed when the status code is 200
            if (response.getStatusLine().getStatusCode() == 200) {
                try {
                    InputStream inputStream = entity.getContent();
                    String imageType = entity.getContentType().getValue();
                    String urlB = imageType.split("/")[1]; // file suffix taken from the content type
                    String uuid = UUID.randomUUID().toString();
                    FileUtils.copyToFile(inputStream, new File(PropertiesUtil.getValue("imageFilePath") + DateUtil.getCurrentDatePath() + "/" + uuid + "." + urlB));
                } catch (UnsupportedOperationException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            } else {
                logger.error("status code was not 200");
            }
        } else {
            logger.error("connection timed out");
        }
        try {
            if (response != null) {
                response.close();
            }
            if (httpClient != null) {
                httpClient.close();
            }
        } catch (Exception e) {
            logger.error("Exception", e);
        }
        logger.info("Finished downloading image " + link);
    }
}
Database creation script:
CREATE DATABASE db_blogs;
SET FOREIGN_KEY_CHECKS=0;
-- ----------------------------
-- Table structure for t_article
-- ----------------------------
DROP TABLE IF EXISTS `t_article`;
CREATE TABLE `t_article` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'auto-increment primary key',
  `title` varchar(200) DEFAULT NULL COMMENT 'blog title',
  `content` longtext COMMENT 'blog body content',
  `orUrl` varchar(1000) DEFAULT NULL COMMENT 'original blog URL',
  `crawlerDate` datetime DEFAULT NULL COMMENT 'time the post was crawled',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
Create ehcache.xml under D:\ehcache (this is the cacheFilePath configured in spider.properties).
Run CnBlogSpider.java.

Open the database, check the crawled rows, and pick one.
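Any MySQL client works for this step. As an alternative, a small sketch like the one below (not part of the project; the class name ArticleListTest is made up, and it simply reuses DbUtil) lists what was crawled so you can pick a row:

package com.inossem.blog.spider;

// Quick check of the crawled rows, reusing DbUtil (illustrative sketch only).
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;

import com.inossem.blog.util.DbUtil;

public class ArticleListTest {
    public static void main(String[] args) throws Exception {
        DbUtil dbUtil = new DbUtil();
        Connection con = dbUtil.getCon();
        PreparedStatement pstmt = con.prepareStatement(
                "select id, title, orUrl, crawlerDate from t_article order by id desc");
        ResultSet rs = pstmt.executeQuery();
        while (rs.next()) {
            System.out.println(rs.getInt("id") + " | " + rs.getString("title")
                    + " | " + rs.getString("orUrl") + " | " + rs.getTimestamp("crawlerDate"));
        }
        rs.close();
        pstmt.close();
        dbUtil.closeCon(con);
    }
}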

Here we pick the post titled 【机器学习】算法原理详细推导与实现(一):线性回归 and work with its content.

Create a text file named 博客内容.txt ("blog content.txt").

Copy the content column into 博客内容.txt, then rename the file to 博客内容.html.

A partial screenshot of the opened HTML file is shown below:

With that, the crawl has succeeded. Done!