maven
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-jdbc</artifactId>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<!-- JsoupXpath: XPath queries on top of jsoup, used by the crawler -->
<dependency>
<groupId>cn.wanghaomiao</groupId>
<artifactId>JsoupXpath</artifactId>
<version>2.3.2</version>
</dependency>
<!-- Hutool: general-purpose utility library (date, HTTP and JSON helpers) -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>4.5.1</version>
</dependency>
</dependencies>
代码
package com.hskj.tvdate.reptile;
import cn.hutool.core.date.DateUtil;
import cn.hutool.http.HttpUtil;
import cn.hutool.json.JSONObject;
import cn.hutool.json.JSONUtil;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.seimicrawler.xpath.JXDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
/**
 * Scheduled crawler for the Baidu "Top" (风云榜) film and TV ranking boards (project: tvdate).
 *
 * <p>For every category in {@link #MAP} the ranking page is downloaded and queried with XPath.
 * For each ranked title the intro JSON is fetched from the matching {@link #KEY_VALUE} endpoint
 * and one row is inserted into the {@code baidu_situation} table. The per-title work runs on
 * {@link #executorService}.
 *
 * <p>Author: hw. Created: 2020-01-29 17:18.
 */
@Component
public class BaiDuSituation {

  private static final Logger log = LoggerFactory.getLogger(BaiDuSituation.class);

  /** XPath selecting the title text of every ranked entry on a board page. */
  private static final String TITLE_XPATH = "//td[@class='keyword']/a[1]/text()";

  /** XPath selecting the heat-index text of every ranked entry on a board page. */
  private static final String INDEX_XPATH = "//td[@class='last']/span/text()";

  /** Number of positional arguments {@link #addBaiduDateBase(Object...)} requires. */
  private static final int INSERT_ARG_COUNT = 9;

  /** Parameterised insert; the column order matches the binding order in addBaiduDateBase. */
  private static final String INSERT_SQL =
      "insert into baidu_situation"
          + "(create_day,no,title,`type`,brief,`index`,url_baike,image_url,image_big)"
          + " values(?,?,?,?,?,?,?,?,?)";

  @Autowired JdbcTemplate jdbcTemplate;

  /**
   * Pool that runs the per-title fetch-and-insert tasks. It is never shut down in this class.
   */
  static ExecutorService executorService = Executors.newFixedThreadPool(30);

  /** Category name mapped to the URL of that category's ranking page. */
  static final Map<String, Object> MAP = new HashMap<>();

  static {
    MAP.put("电影", "http://top.baidu.com/buzz?b=26&c=1&fr=topcategory_c1");
    MAP.put("电视剧", "http://top.baidu.com/buzz?b=4&c=2&fr=topcategory_c2");
    MAP.put("综艺", "http://top.baidu.com/buzz?b=19&c=3&fr=topcategory_c3");
    MAP.put("动漫", "http://top.baidu.com/buzz?b=23&c=5&fr=topcategory_c5");
    MAP.put("少儿", "http://top.baidu.com/buzz?b=1677&fr=topbuzz_b23_c5");
    MAP.put("纪录片", "http://top.baidu.com/buzz?b=1678&fr=topbuzz_b23_c5");
  }

  /**
   * Category name mapped to the intro-endpoint URL prefix. The URL-encoded title must be appended
   * to the value. According to the original author's note the endpoint answers with
   * unicode-escaped JSON that has to be decoded.
   */
  static final Map<String, Object> KEY_VALUE = new HashMap<>();

  static {
    KEY_VALUE.put("电影", "http://top.baidu.com/detail/intro?boardid=26&keyword=");
    KEY_VALUE.put("电视剧", "http://top.baidu.com/detail/intro?boardid=4&keyword=");
    KEY_VALUE.put("综艺", "http://top.baidu.com/detail/intro?boardid=19&keyword=");
    KEY_VALUE.put("动漫", "http://top.baidu.com/detail/intro?boardid=23&keyword=");
    KEY_VALUE.put("少儿", "http://top.baidu.com/detail/intro?boardid=1677&keyword=");
    KEY_VALUE.put("纪录片", "http://top.baidu.com/detail/intro?boardid=1678&keyword=");
  }

  /**
   * Crawls every ranking board and queues one fetch-and-insert task per ranked title.
   *
   * <p>Triggered by the cron expression on this method (12:00:00 every day). The method returns
   * once all tasks are queued; it does not wait for them to finish.
   *
   * @throws Exception if a ranking page cannot be downloaded or parsed; remaining categories are
   *     then not processed
   */
  @Scheduled(cron = "00 00 12 * * ?")
  public void addBaiduData() throws Exception {
    String today = DateUtil.today();
    log.info("百度风云榜爬取数据定时任务开始执行");
    for (Map.Entry<String, Object> board : MAP.entrySet()) {
      String type = board.getKey();
      String pageUrl = board.getValue().toString();

      // Close the network stream explicitly instead of relying on the parser to do so.
      Document document;
      try (InputStream in = new URL(pageUrl).openStream()) {
        document = Jsoup.parse(in, "GBK", pageUrl);
      }
      JXDocument page = JXDocument.create(document.toString());
      List<Object> titles = page.sel(TITLE_XPATH);
      List<Object> indexes = page.sel(INDEX_XPATH);

      for (int i = 0; i < titles.size(); i++) {
        Object title = titles.get(i);
        // The two XPath result lists are independent; tolerate a shorter index list.
        Object heatIndex = i < indexes.size() ? indexes.get(i) : null;
        int rank = i;
        executorService.execute(() -> crawlAndStore(type, title, heatIndex, rank, today));
      }
    }
  }

  /**
   * Fetches the intro JSON for one ranked title and stores the resulting row.
   *
   * <p>Runs on a pool thread, so every runtime failure (HTTP error, SQL error such as a unique-key
   * violation) is logged here rather than left to escape the worker unlogged.
   */
  private void crawlAndStore(String type, Object title, Object heatIndex, int rank, String today) {
    try {
      String introUrl = KEY_VALUE.get(type) + strToUrlDeCode(String.valueOf(title));
      Map<String, Object> intro = summaryJsonToMap(HttpUtil.get(introUrl));
      if (intro.isEmpty()) {
        log.info("该影片没有简介:{}", title);
      }
      addBaiduDateBase(
          title,
          type,
          intro.get("abstract"),
          heatIndex,
          intro.get("url"),
          intro.get("image"),
          intro.get("orin_image"),
          rank,
          today);
    } catch (RuntimeException e) {
      log.error("百度风云榜爬取或入库失败, 类型:{}, 标题:{}", type, title, e);
    }
  }

  /**
   * URL-encodes a string using the GB18030 charset.
   *
   * <p>Despite the name, this method encodes; it does not decode.
   *
   * @param str text to encode, typically a Chinese title; {@code null} yields an empty string
   * @return the URL-encoded text, or an empty string if the charset is unavailable
   */
  public static String strToUrlDeCode(String str) {
    if (str == null) {
      return "";
    }
    try {
      return URLEncoder.encode(str, "gb18030");
    } catch (UnsupportedEncodingException e) {
      log.error("gb18030 charset is not supported by this JVM", e);
      return "";
    }
  }

  /**
   * Extracts the {@code content} object of an intro JSON response as a map.
   *
   * @param json raw intro JSON
   * @return the {@code content} object; this class reads the keys {@code url} (Baike link),
   *     {@code image} (small cover), {@code orin_image} (large cover) and {@code abstract}
   *     (summary) from it. Never {@code null}: an empty map is returned when the JSON cannot be
   *     parsed
   */
  public static Map<String, Object> summaryJsonToMap(String json) {
    try {
      JSONObject root = JSONUtil.parseObj(json);
      JSONObject content = JSONUtil.parseObj(root.get("content"));
      if (content != null) {
        return content;
      }
    } catch (RuntimeException e) {
      // Not every title has an intro; the caller treats an empty map as "no intro available".
      log.debug("intro JSON could not be parsed: {}", json, e);
    }
    return Collections.emptyMap();
  }

  /**
   * Inserts one row into {@code baidu_situation} using a parameterised statement.
   *
   * @param obj positional values: {@code [0]} title, {@code [1]} type, {@code [2]} brief,
   *     {@code [3]} heat index, {@code [4]} Baike URL, {@code [5]} small cover URL, {@code [6]}
   *     large cover URL, {@code [7]} rank ({@code no}), {@code [8]} creation day; elements after
   *     the ninth are ignored
   * @throws IllegalArgumentException if fewer than nine values are supplied
   */
  public void addBaiduDateBase(Object... obj) {
    if (obj == null || obj.length < INSERT_ARG_COUNT) {
      throw new IllegalArgumentException(
          "expected " + INSERT_ARG_COUNT + " values but got " + (obj == null ? 0 : obj.length));
    }
    // Bind in column order; values are scraped text and must never be spliced into the SQL.
    Object[] args = {
      toSqlArg(obj[8]),
      toSqlArg(obj[7]),
      toSqlArg(obj[0]),
      toSqlArg(obj[1]),
      toSqlArg(obj[2]),
      toSqlArg(obj[3]),
      toSqlArg(obj[4]),
      toSqlArg(obj[5]),
      toSqlArg(obj[6])
    };
    int update = jdbcTemplate.update(INSERT_SQL, args);
    if (update < 1) {
      // Arrays.toString: passing the array itself would be expanded as logger varargs.
      log.error("数据库插入数据库失败,参数为:{}", Arrays.toString(obj));
    }
  }

  /**
   * Converts a scraped value into something JDBC can bind: {@code null} and numbers pass through,
   * everything else is bound by its string form.
   */
  private static Object toSqlArg(Object value) {
    if (value == null || value instanceof Number) {
      return value;
    }
    return value.toString();
  }
}
数据库
-- Table holding one row per crawled ranking entry.
-- Columns written by the crawler's insert: create_day, no, title, type, brief, index,
-- url_baike, image_url, image_big. The remaining columns (id, remark, create_time, state)
-- are filled by their defaults.
-- The unique key on (title, type, create_day) rejects a second insert of the same title in the
-- same category on the same day.
CREATE TABLE `baidu_situation` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`title` varchar(255) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '标题名称',
`type` varchar(255) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '类型',
`no` int(11) DEFAULT NULL COMMENT '排名',
`brief` varchar(2550) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '简介',
`index` varchar(255) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '热度指数',
`url_baike` varchar(500) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '百科链接',
`image_url` varchar(500) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '封面链接',
`image_big` varchar(500) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '大图封面链接',
`create_day` varchar(255) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '创建的年月日 用于做索引',
`remark` varchar(255) CHARACTER SET utf8 DEFAULT NULL COMMENT '备注',
`create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
`state` int(11) NOT NULL DEFAULT '0' COMMENT '数据有效性(0-有效,1-无效)',
PRIMARY KEY (`id`),
UNIQUE KEY `titleDay` (`title`,`type`,`create_day`) USING BTREE COMMENT '唯一索引做标识防止重复爬取'
) ENGINE=InnoDB AUTO_INCREMENT=301 DEFAULT CHARSET=utf8 COLLATE=utf8_czech_ci;