环境及工具
集群环境:
CentOS 7
Hadoop 3.1.3
Hive 3.1.2
Sqoop 1.4.6
MySQL 5.7
开发环境:
win10
IDEA
JDK1.8
spring版本任意
1.页面需求
1.1福利标签词云图
点击按钮出现福利标签词云图,对招聘信息进行数据分析和处理,提取求职信息中公司福利标签并将其综合统计。
1.2职位区域分布图
点击按钮出现职位区域分布图,对招聘信息进行数据分析和处理,提取求职信息中职位城市并将其综合统计。
1.3薪资分布柱状图
点击按钮出现薪资分布柱状图,对招聘信息进行数据分析和处理,提取求职信息中职位薪资并将其合并后统计。
1.4技能标签分布图
点击按钮出现技能标签分布图,对招聘信息进行数据分析和处理,提取求职信息中职位技能标签并将其合并后统计。
2.总体设计
利用Linux和HDFS技术对招聘网站的数据进行预处理、离线分析与可视化展示。首先利用HDFS、MapReduce、Hive、Sqoop等核心大数据技术组件实现离线数据仓库平台的搭建;然后,利用大数据日志的特点,开发相应的MapReduce程序完成对招聘信息进行数据提取和清洗等,产生招聘职位的MySQL数据库;最后,对求职数据库进行分析,生成技能标签分布图、薪资分布柱状图、职位区域分布与福利标签词云图,并在网页上展示,为求职者提供直观的职位信息参考。
2.1数据预处理
通过编写MapReduce程序,实现将采集的源数据进行预处理得到目标数据的过程。
图2.1
2.2数据分析
通过使用基于分布式文件系统的Hive对招聘数据进行分析。 Hive是建立在Hadoop分布式文件系统上的数据仓库,它提供了一系列工具,能够对存储在HDFS中的数据进行数据提取、转换和加载(ETL)。
针对招聘网站的职位数据分析项目,将Hive数据仓库设计为星型模型,星型模型是由一张事实表和多张维度表组成。
图2.2
2.3数据可视化
运行Sqoop将Hive中的表数据导出到关系型数据库中,方便后续进行数据可视化处理。职位分析可视化系统以JavaWeb为基础搭建,通过SSM(Spring、Springmvc、Mybatis)框架实现后端功能,前端在Jsp中使用Echarts实现可视化展示,前后端的数据交互是通过SpringMVC与AJAX交互实现。
3.代码及运行界面
jobweb项目
3.1 pom文件配置(注意修改版本号)
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>jobcase</groupId>
    <artifactId>jobcase-clean</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>
    <dependencies>
        <!-- hadoop-common was declared twice in the original pom; the duplicate
             declaration has been removed. -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>3.1.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.1.3</version>
        </dependency>
        <!-- test scope keeps JUnit out of the jar-with-dependencies assembly -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.30</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.44</version>
        </dependency>
    </dependencies>
    <!-- Maven packaging plugins -->
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.6.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <!-- builds a fat jar so the MapReduce job can ship its dependencies -->
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
3.2 controller层
IndexController.java
package cn.itcast.controller;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.ResponseBody;
import cn.itcast.service.CityService;
import cn.itcast.service.CompanyService;
import cn.itcast.service.KillService;
import cn.itcast.service.SalaryService;
@Controller
public class IndexController {

    // Data services backing the four ECharts views on the dashboard.
    @Autowired
    private CityService cityService;
    @Autowired
    private SalaryService salaryService;
    @Autowired
    private CompanyService companyService;
    @Autowired
    private KillService killService;

    /** Renders the dashboard page. */
    @RequestMapping("/index")
    public String showIndex() {
        return "index";
    }

    /** Job distribution per city, serialized as JSON for the map chart. */
    @RequestMapping(value = "/city", produces = "application/json;charset=UTF-8")
    @ResponseBody
    public String getCity() {
        return cityService.getCityData();
    }

    /** Salary distribution data for the bar chart. */
    @RequestMapping(value = "/salary", produces = "application/json;charset=UTF-8")
    @ResponseBody
    public String getSalary() {
        return salaryService.getSalaryData();
    }

    /** Company welfare tags for the word-cloud chart. */
    @RequestMapping(value = "/company", produces = "application/json;charset=UTF-8")
    @ResponseBody
    public String getCompany() {
        return companyService.getCompanyData();
    }

    /** Skill tags for the skill-distribution chart. */
    @RequestMapping(value = "/kill", produces = "application/json;charset=UTF-8")
    @ResponseBody
    public String getKill() {
        return killService.getKillData();
    }
}
3.3 mapper层
CityMapper.java
package cn.itcast.mapper;
import java.util.List;
import cn.itcast.pojo.CityPojo;
/**
 * MyBatis mapper for the per-city job-count aggregation.
 */
public interface CityMapper {
    /** Returns one row per city with its job-posting count. */
    List<CityPojo> selectCity();
}
CompanyMapper.java
package cn.itcast.mapper;
import java.util.List;
import cn.itcast.pojo.CompanyPojo;
/**
 * MyBatis mapper for the welfare-tag aggregation.
 */
public interface CompanyMapper {
    /** Returns one row per welfare tag with its occurrence count. */
    List<CompanyPojo> selectCompany();
}
KillMapper.java
package cn.itcast.mapper;
import java.util.List;
import cn.itcast.pojo.KillPojo;
/**
 * MyBatis mapper for the skill-tag aggregation.
 */
public interface KillMapper {
    /** Returns one row per skill tag with its occurrence count. */
    List<KillPojo> selectKill();
}
SalaryMapper.java
package cn.itcast.mapper;
import java.util.List;
import cn.itcast.pojo.SalaryPojo;
/**
 * MyBatis mapper for the salary-range aggregation.
 */
public interface SalaryMapper {
    /** Returns one row per salary range with its job-posting count. */
    List<SalaryPojo> selectSalary();
}
3.4 Pojo层
CityPojo.java
package cn.itcast.pojo;
/**
 * One row of the job-count-per-city aggregation, rendered by
 * {@link #toString()} as an ECharts-compatible {name, value} JSON fragment.
 */
public class CityPojo {

    // City name extracted from the job postings.
    private String city;
    // Number of postings counted for that city.
    private int count;

    public String getCity() {
        return this.city;
    }

    public void setCity(String city) {
        this.city = city;
    }

    public int getCount() {
        return this.count;
    }

    public void setCount(int count) {
        this.count = count;
    }

    /** Renders the pair as a JSON object string, e.g. {"name":"Beijing","value":10}. */
    @Override
    public String toString() {
        return String.format("{\"name\":\"%s\",\"value\":%d}", this.city, this.count);
    }
}
CompanyPojo.java
package cn.itcast.pojo;
/**
 * One row of the welfare-tag aggregation, rendered by {@link #toString()}
 * as an ECharts word-cloud {name, value} JSON fragment.
 */
public class CompanyPojo {

    // Welfare/benefit tag text.
    private String company;
    // Number of postings carrying that tag.
    private int count;

    public String getCompany() {
        return this.company;
    }

    public void setCompany(String company) {
        this.company = company;
    }

    public int getCount() {
        return this.count;
    }

    public void setCount(int count) {
        this.count = count;
    }

    /** Renders the pair as a JSON object string, e.g. {"name":"bonus","value":3}. */
    @Override
    public String toString() {
        return String.format("{\"name\":\"%s\",\"value\":%d}", this.company, this.count);
    }
}
KillPojo.java
package cn.itcast.pojo;
/**
 * One row of the skill-tag aggregation, rendered by {@link #toString()}
 * as an ECharts-compatible {name, value} JSON fragment.
 */
public class KillPojo {

    // Skill tag text.
    private String kills;
    // Number of postings carrying that tag.
    private int count;

    public String getKills() {
        return this.kills;
    }

    public void setKills(String kills) {
        this.kills = kills;
    }

    public int getCount() {
        return this.count;
    }

    public void setCount(int count) {
        this.count = count;
    }

    /** Renders the pair as a JSON object string, e.g. {"name":"Java","value":12}. */
    @Override
    public String toString() {
        return String.format("{\"name\":\"%s\",\"value\":%d}", this.kills, this.count);
    }
}
SalaryPojo.java
package cn.itcast.pojo;
/**
 * One row of the salary-range aggregation, rendered by {@link #toString()}
 * as an ECharts-compatible {name, value} JSON fragment.
 */
public class SalaryPojo {

    // Salary range label (e.g. "10-15").
    private String salary;
    // Number of postings falling in that range.
    private int count;

    public String getSalary() {
        return this.salary;
    }

    public void setSalary(String salary) {
        this.salary = salary;
    }

    public int getCount() {
        return this.count;
    }

    public void setCount(int count) {
        this.count = count;
    }

    /** Renders the pair as a JSON object string, e.g. {"name":"10-15","value":8}. */
    @Override
    public String toString() {
        return String.format("{\"name\":\"%s\",\"value\":%d}", this.salary, this.count);
    }
}
3.5 service.impl层
CityServiceImpl.java
package cn.itcast.service.impl;
import cn.itcast.mapper.CityMapper;
import cn.itcast.pojo.CityPojo;
import cn.itcast.service.CityService;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.List;
@Service
public class CityServiceImpl implements CityService {

    // Jackson's ObjectMapper is thread-safe and expensive to construct:
    // share one instance instead of allocating a new one per request.
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    @Autowired
    private CityMapper mapper;

    /**
     * Loads the per-city job counts and serializes them as a JSON array of
     * "{name, value}" strings consumed by the ECharts front end.
     *
     * @return the JSON payload, or null if serialization fails
     */
    @Override
    public String getCityData() {
        List<CityPojo> rows = mapper.selectCity();
        List<String> resultData = new ArrayList<String>(rows.size());
        for (CityPojo row : rows) {
            resultData.add(row.toString());
        }
        String beanJson = null;
        try {
            beanJson = OBJECT_MAPPER.writeValueAsString(resultData);
        } catch (JsonProcessingException e) {
            // Best-effort: keep the null return the caller already handles.
            e.printStackTrace();
        }
        return beanJson;
    }
}
CompanyServiceImpl.java
package cn.itcast.service.impl;
import cn.itcast.mapper.CompanyMapper;
import cn.itcast.pojo.CompanyPojo;
import cn.itcast.service.CompanyService;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.util.ArrayList;
import java.util.List;
@Service
public class CompanyServiceImpl implements CompanyService {
@Autowired
private CompanyMapper mapper;
@Transactional
public String getCompanyData() {
List<CompanyPojo> lists = mapper.selectCompany();
List<String> resultData = new ArrayList<String>();
for (CompanyPojo companyPojo : lists) {
resultData.add(companyPojo.toString());
}
ObjectMapper om = new ObjectMapper();
String beanJson=null;
try {
beanJson=om.writeValueAsString(resultData);
} catch (JsonProcessingException e) {
e.printStackTrace();
}
return beanJson;
}
}
KillServiceImpl.java
package cn.itcast.service.impl;
import cn.itcast.mapper.KillMapper;
import cn.itcast.pojo.KillPojo;
import cn.itcast.service.KillService;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.util.ArrayList;
import java.util.List;
@Service
public class KillServiceImpl implements KillService {
@Autowired
private KillMapper mapper;
@Transactional
public String getKillData() {
List<KillPojo> lists = mapper.selectKill();
List<String> resultData = new ArrayList<String>();
for (KillPojo killPojo : lists) {
resultData.add(killPojo.toString());}
ObjectMapper om = new ObjectMapper();String beanJson = null;
try {
beanJson = om.writeValueAsString(resultData);
} catch (JsonProcessingException e) {
e.printStackTrace();
}
return beanJson;
}
}
SalaryServiceImpl.java
package cn.itcast.service.impl;
import cn.itcast.mapper.SalaryMapper;
import cn.itcast.pojo.SalaryPojo;
import cn.itcast.service.SalaryService;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.util.ArrayList;
import java.util.List;
@Service
public class SalaryServiceImpl implements SalaryService {
@Autowired
private SalaryMapper salaryMapper;
@Transactional
public String getSalaryData() {
List<SalaryPojo> lists = salaryMapper.selectSalary();
ArrayList<String> resultData = new ArrayList<>();
for (SalaryPojo salaryPojo : lists) {
resultData.add(salaryPojo.toString());
}
ObjectMapper om = new ObjectMapper();
String beanJson = null;
try {
beanJson=om.writeValueAsString(resultData);
} catch (JsonProcessingException e) {
e.printStackTrace();
}
return beanJson;
}
}
3.6 service层
CityService.java
package cn.itcast.service;
/**
 * Supplies the per-city job-distribution JSON payload for the front end.
 */
public interface CityService {
    String getCityData();
}
CompanyService.java
package cn.itcast.service;
/**
 * Supplies the welfare-tag word-cloud JSON payload for the front end.
 */
public interface CompanyService {
    String getCompanyData();
}
KillService.java
package cn.itcast.service;
/**
 * Supplies the skill-tag distribution JSON payload for the front end.
 */
public interface KillService {
    String getKillData();
}
SalaryService.java
package cn.itcast.service;
/**
 * Supplies the salary-distribution JSON payload for the front end.
 */
public interface SalaryService {
    String getSalaryData();
}
这里顺序有误:应该先运行jobcase项目对数据进行预处理,再运行jobweb项目做可视化展示。
jobcase项目
CleanJob.java
package com.position.clean;
import org.codehaus.jettison.json.JSONArray;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
/**
 * Cleaning helpers used by the MapReduce job: strip salary units and merge
 * welfare/skill tag lists into "-"-separated strings, then assemble each job
 * posting into one CSV line (city,salary,welfare,skills).
 */
public class CleanJob {

    /**
     * Removes every occurrence of a single character from a string.
     * Used to strip the 'k' unit from salary values such as "10k-15k".
     *
     * @param str     the string to process
     * @param delChar the character to delete
     * @return str with all occurrences of delChar removed
     */
    public static String deleteString(String str, char delChar) {
        return str.replace(String.valueOf(delChar), "");
    }

    /**
     * Merges the company welfare-tag array and the free-text position-advantage
     * field into one "-"-separated string.
     *
     * Fixes over the original version:
     * - {@code position != ""} was a reference comparison; now uses isEmpty().
     * - The split regex began with an empty alternative ("|;|...") which made
     *   String.split break between every character; a character class is used.
     * - substring(0, length-1) threw on empty input and chopped the last real
     *   character when position text had been appended; joining is now explicit.
     *
     * @param position free-text advantage field, tags separated by punctuation
     * @param company  JSON array of welfare tags
     * @return all tags joined with '-', empty string if there are none
     * @throws JSONException if a company element cannot be read
     */
    public static String mergeString(String position, JSONArray company) throws JSONException {
        StringBuilder result = new StringBuilder();
        for (int i = 0; i < company.length(); i++) {
            if (result.length() > 0) {
                result.append('-');
            }
            result.append(company.get(i));
        }
        if (position != null && !position.isEmpty()) {
            // Split on the common Chinese/ASCII tag separators.
            String[] parts = position.split("[;,、,;/]");
            for (String part : parts) {
                // Drop any remaining punctuation inside a tag.
                String cleaned = part.replaceAll("[\\pP\\p{Punct}]", "");
                if (!cleaned.isEmpty()) {
                    if (result.length() > 0) {
                        result.append('-');
                    }
                    result.append(cleaned);
                }
            }
        }
        return result.toString();
    }

    /**
     * Joins the skill-tag array into a "-"-separated string.
     *
     * @param killData JSON array of skill tags
     * @return tags joined with '-', empty string if the array is empty
     * @throws JSONException if an element cannot be read
     */
    public static String killResult(JSONArray killData) throws JSONException {
        StringBuilder result = new StringBuilder();
        for (int i = 0; i < killData.length(); i++) {
            if (i > 0) {
                result.append('-');
            }
            result.append(killData.get(i));
        }
        return result.toString();
    }

    /**
     * Converts the "result" array of job postings into newline-separated CSV
     * lines of the form: city,salary,welfare,skills
     *
     * @param jobdata JSON array of job-posting objects
     * @return one CSV line per posting, joined with '\n' (no trailing newline)
     * @throws JSONException if an expected field is missing or malformed
     */
    public static String resultToString(JSONArray jobdata) throws JSONException {
        StringBuilder out = new StringBuilder();
        for (int i = 0; i < jobdata.length(); i++) {
            // Each element is one job posting.
            JSONObject job = new JSONObject(jobdata.get(i).toString());
            String city = job.getString("city");
            // Strip the 'k' unit from the salary range.
            String salaryNew = deleteString(job.getString("salary"), 'k');
            String welfare = mergeString(job.getString("positionAdvantage"),
                    job.getJSONArray("companyLabelList"));
            String kill = killResult(job.getJSONArray("skillLables"));
            if (i > 0) {
                out.append('\n');
            }
            out.append(city).append(',').append(salaryNew).append(',')
               .append(welfare).append(',').append(kill);
        }
        return out.toString();
    }
}
CleanMain.java
package com.position.clean;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.BasicConfigurator;
/**
 * Driver for the map-only cleaning job: reads the crawled job-posting files,
 * runs {@link CleanMapper} over them, and writes the cleaned CSV lines.
 */
public class CleanMain {
    public static void main(String[] args) throws Exception {
        // Route log4j output to the console.
        BasicConfigurator.configure();
        // Initialize the Hadoop configuration.
        Configuration conf = new Configuration();
        // Separate generic Hadoop options from the job's own arguments.
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        // Exactly two arguments are required: input path and output path.
        if (otherArgs.length != 2) {
            System.err.println("Usage: CleanMain <in> <out>");
            System.exit(2);
        }
        // Job.getInstance replaces the deprecated new Job(conf, name) constructor.
        Job job = Job.getInstance(conf, "job");
        // Jar containing the job classes.
        job.setJarByClass(CleanMain.class);
        // Mapper that cleans each crawled record.
        job.setMapperClass(CleanMapper.class);
        // Combine many small input files into fewer splits.
        job.setInputFormatClass(CombineTextInputFormat.class);
        // Minimum combined split size: 2 MB (2097152 bytes).
        CombineTextInputFormat.setMinInputSplitSize(job, 2097152);
        // Maximum combined split size: 4 MB (4194304 bytes).
        CombineTextInputFormat.setMaxInputSplitSize(job, 4194304);
        // Output key/value types: cleaned text lines, no value.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Input and output paths from the command line.
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        // Run the job and exit with its status code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
CleanMapper.java
package com.position.clean;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.codehaus.jettison.json.JSONArray;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
import java.io.IOException;
/**
 * Mapper that turns one crawled record into cleaned CSV lines.
 * Input value: a raw crawler line whose JSON payload sits after two '='
 * characters; output key: the CSV lines produced by CleanJob.resultToString,
 * output value: nothing (NullWritable).
 */
public class CleanMapper extends Mapper<LongWritable, Text,Text, NullWritable> {
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
String jobResultData = "";
// Convert the raw record to a String.
String reptileData = value.toString();
// Isolate the JSON payload by cutting after the second '=' and dropping the
// final character.
// NOTE(review): assumes every record carries exactly two '=' before the JSON
// body and one trailing character after it — confirm against the crawler output.
String jobData = reptileData.substring(reptileData.indexOf("=", reptileData.indexOf("=") + 1) + 1, reptileData.length() - 1);
try {
// Parse the payload and pull out the "content" object.
JSONObject contentJson = new JSONObject(jobData);
String contentData = contentJson.getString("content");
// Extract "positionResult" nested inside content.
JSONObject positionResultJson = new JSONObject(contentData);
String positionResultData = positionResultJson.getString("positionResult");
// Extract the "result" array holding the individual job postings.
JSONObject resultJson = new JSONObject(positionResultData);
JSONArray resultData = resultJson.getJSONArray("result");
jobResultData = CleanJob.resultToString(resultData);
// Emit the cleaned CSV lines as the key; no value is needed.
context.write(new Text(jobResultData),NullWritable.get());
} catch (JSONException e) {
// Malformed records are skipped; the stack trace goes to the task log.
e.printStackTrace();
}
}
}
运行页面: