前几篇文章,我们对慕课网的课程进行了爬取,本文就对这些数据进行统计和可视化,让数据更直观地展现出来。
介绍
Flask 是基于Python的非常流行的框架之一,主要用于web开发,适合开发中小型项目,易于扩展。Flask的官方网站是 http://flask.pocoo.org/ 。
Echarts (http://echarts.baidu.com/ )是百度出品的,基于Canvas的,纯Javascript 的图表库,提供直观,生动,可交互,可个性化定制的数据可视化图表。创新的拖拽重计算、数据视图、值域漫游等特性大大增强了用户体验,赋予了用户对数据进行挖掘、整合的能力。
搭建 Flask web 项目
安装必要的依赖库
pip install Flask
pip install PyMySQL
web项目目录结构如下:
├── web
│   ├── static
│   │   └── js
│   │       ├── dark.js
│   │       └── echarts.min.js
│   ├── templates
│   │   └── index.html
│   ├── __init__.py
│   └── views.py
└── runserver.py
其中runserver.py为项目启动文件:
#!/usr/bin/python
# -*- coding: utf-8 -*-
from web import app
if __name__ == '__main__':
    import os

    # The server listens on every interface (0.0.0.0). Flask's debug mode
    # enables the Werkzeug interactive debugger, which lets anyone who can
    # reach the port execute arbitrary Python code, so debug mode is opt-in:
    # run with FLASK_DEBUG=1 on a trusted development machine only.
    debug = os.environ.get('FLASK_DEBUG', '0') == '1'
    app.run(host='0.0.0.0', debug=debug)
__init__.py是项目的主文件
# -*- coding: utf-8 -*-
from flask import Flask
# Application object; runserver.py and web/views.py import it from this package.
app = Flask(__name__)
# Imported last on purpose: web.views does `from web import app` at import
# time, so `app` must already exist when this import runs.
import web.views
views.py 为视图函数:
# -*- coding: utf-8 -*-
import contextlib
import pymysql
from flask import jsonify, make_response, render_template, request
from web import app
# 数据库连接
# 定义上下文管理器,连接后自动关闭连接
@contextlib.contextmanager
def mysql(host='127.0.0.1',
          port=3306,
          user='root',
          passwd='abc-123',
          db='demo_db',
          charset='utf8'):
    """Yield a dict cursor on a fresh MySQL connection and clean up afterwards.

    The transaction is committed only when the ``with`` body finishes
    normally; if the body raises, the transaction is rolled back and the
    exception propagates. The cursor and the connection are always closed.

    Args:
        host: MySQL server host.
        port: MySQL server port.
        user: User name to log in with.
        passwd: Password for ``user``.
        db: Database (schema) to use.
        charset: Connection character set.

    Yields:
        A ``pymysql.cursors.DictCursor`` whose rows are dictionaries keyed
        by column name.
    """
    conn = pymysql.connect(
        host=host, port=port, user=user, passwd=passwd, db=db, charset=charset)
    try:
        # Created inside the try so the connection is closed even if cursor
        # creation itself fails.
        cursor = conn.cursor(cursor=pymysql.cursors.DictCursor)
        try:
            yield cursor
            conn.commit()
        except Exception:
            # Never commit half-finished work from a failed block.
            conn.rollback()
            raise
        finally:
            cursor.close()
    finally:
        conn.close()
# 首页
@app.route('/')
def hello_world():
    """Serve the site root by rendering the ``index.html`` template."""
    page = render_template('index.html')
    return page
# 每个课程类型的课程数
@app.route('/api/type')
def api_type():
    """Return, as JSON, the number of courses for each ``type`` value."""
    sql = "SELECT type as name,count(id) as value from imooc_courses GROUP BY type"
    with mysql() as cur:
        cur.execute(sql)
        rows = cur.fetchall()
        return json_success(rows)
# 每个学习方向的课程数
@app.route('/api/cate')
def api_cate():
    """Return, as JSON, the number of courses per category.

    Rows are grouped by the raw ``cate`` column and then passed through
    ``transform_cate`` before being returned.
    """
    sql = "SELECT cate as name,count(id) as value from imooc_courses GROUP BY cate"
    with mysql() as cur:
        cur.execute(sql)
        grouped_rows = cur.fetchall()
        return json_success(transform_cate(grouped_rows))
# 所有课程的学习人数
@app.route('/api/learn_num')
def api_learn_num():
    """Return, as JSON, every course title with its learner count, ascending."""
    sql = (
        "SELECT title as name,learn_num as value "
        "from imooc_courses ORDER BY learn_num ASC"
    )
    with mysql() as cur:
        cur.execute(sql)
        rows = cur.fetchall()
        return json_success(rows)
# 每个方向的学习人数
@app.route('/api/learn_num_cate')
def api_learn_num_cate():
    """Return, as JSON, the summed learner count per category.

    The sum is cast to CHAR in SQL, so each raw row carries its total as a
    string; rows are passed through ``transform_cate`` before being returned.
    """
    sql = (
        "SELECT cate as name,CAST(sum(learn_num) AS CHAR) as value "
        "from imooc_courses GROUP BY cate ORDER BY sum(learn_num) DESC"
    )
    with mysql() as cur:
        cur.execute(sql)
        grouped_rows = cur.fetchall()
        return json_success(transform_cate(grouped_rows))
# 难度级别
@app.route('/api/difficulty_level')
def api_difficulty_level():
    """Return, as JSON, the number of courses for each difficulty level."""
    sql = (
        "SELECT difficulty_level as name,count(id) as value "
        "from imooc_courses GROUP BY difficulty_level"
    )
    with mysql() as cur:
        cur.execute(sql)
        rows = cur.fetchall()
        return json_success(rows)
# 课程评分
@app.route('/api/overall_rating')
def api_overall_rating():
    """Return, as JSON, the number of courses for each overall rating.

    Rows are ordered by ``overall_rating+0``, i.e. by the rating's numeric
    value, ascending.
    """
    sql = (
        "SELECT overall_rating as name,count(id) as value "
        "from imooc_courses GROUP BY overall_rating "
        "order by overall_rating+0 ASC"
    )
    with mysql() as cur:
        cur.execute(sql)
        rows = cur.fetchall()
        return json_success(rows)
# 课程时长
@app.route('/api/duration')
def api_duration():
    """Return, as JSON, the number of courses for each ``duration`` value.

    Rows are ordered by ``duration+0``, ascending.
    """
    sql = (
        "SELECT duration as name,count(id) as value "
        "from imooc_courses GROUP BY duration "
        "order by duration+0 ASC"
    )
    with mysql() as cur:
        cur.execute(sql)
        rows = cur.fetchall()
        return json_success(rows)
# 学习人数与评分的关系
@app.route('/api/bubble_gradient')
def api_bubble_gradient():
    """Return, as JSON, rating, learner count, a constant 0 and title per course."""
    sql = "SELECT overall_rating,learn_num,0,title FROM imooc_courses"
    with mysql() as cur:
        cur.execute(sql)
        rows = cur.fetchall()
        return json_success(rows)
# 搜索
@app.route('/api/search')
def api_search():
    """Search courses by keyword and return up to 50 matches as JSON.

    The ``keywords`` request parameter is matched as a substring against the
    ``title``, ``cate``, ``type`` and ``brief`` columns; results are ordered
    by ``learn_num`` descending. A missing or empty keyword matches every
    course.
    """
    keywords = request.values.get('keywords') or ''
    # The pattern is passed as a bound parameter so the driver escapes it;
    # building the SQL by string concatenation allowed SQL injection.
    pattern = '%' + keywords + '%'
    with mysql() as cursor:
        cursor.execute(
            "SELECT * FROM imooc_courses "
            "WHERE title LIKE %s OR cate LIKE %s "
            "OR type LIKE %s OR brief LIKE %s "
            "ORDER BY learn_num DESC LIMIT 50",
            (pattern, pattern, pattern, pattern))
        return json_success(cursor.fetchall())
# 由于一个课程可能存在多个cate,以逗号分隔,所以此处重新组合
def transform_cate(cate_data):
    """Re-aggregate grouped rows so every single category gets one total.

    A row's ``name`` may hold several comma-separated categories; the row's
    ``value`` is added to each of them. Rows whose name is empty, ``None`` or
    contains no non-empty category are counted under ``'其他'``.

    Args:
        cate_data: Iterable of dicts with a ``name`` (str or None) and a
            ``value`` (int, or a numeric string). The rows are not modified.

    Returns:
        A list of ``{'name': str, 'value': int}`` dicts, in order of first
        appearance of each category. Values are always ``int``.
    """
    totals = {}
    for item in cate_data:
        # Always convert, so a category seen only once does not keep the
        # string form of its value.
        amount = int(item['value'] or 0)
        names = [part for part in (item['name'] or '').split(',') if part]
        for name in names or ['其他']:
            totals[name] = totals.get(name, 0) + amount
    return [{'name': name, 'value': total} for name, total in totals.items()]
# 返回json数据
def json_success(data):
    """Wrap *data* in the success envelope and return it as a JSON response.

    The response carries headers that allow cross-origin GET and POST
    requests from any origin.
    """
    payload = {'status': 'success', 'data': data, 'info': '成功'}
    resp = make_response(jsonify(payload))
    # Allow cross-origin access.
    cors_headers = (
        ('Access-Control-Allow-Origin', '*'),
        ('Access-Control-Allow-Methods', 'GET,POST'),
    )
    for header, value in cors_headers:
        resp.headers[header] = value
    return resp
templates\index.html为模板文件,主要是通过views.py接口提供的数据,用Echarts进行可视化。
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="X-UA-Compatible" content="ie=edge">
<title>数据可视化分析</title>
<link rel="stylesheet" href="https://cdn.bootcss.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u"
crossorigin="anonymous">
<link rel="stylesheet" href="https://cdn.bootcss.com/bootstrap/3.3.7/css/bootstrap-theme.min.css" integrity="sha384-rHyoN1iRsVXV4nD0JutlnGaslCJuC7uwjduW9SVrLvRYooPp2bWYgmgJQIXwl/Sp"
crossorigin="anonymous">
<script src="https://cdn.bootcss.com/jquery/3.2.1/jquery.js"></script>
<script src="https://cdn.bootcss.com/bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha384-Tc5IQib027qvyjSMfHjOMaLkfuWVxZxUPnCJA7l2mCWNIpG9mGCD8wGNIcPD7Txa"
crossorigin="anonymous"></script>
<script src="http://echarts.baidu.com/dist/echarts.min.js"></script>
<script src="http://echarts.baidu.com/asset/theme/dark.js"></script>
<style>
body {
background: #1b1b1b;
color: #eee;
}
.echarts_box {
margin: 10px 0px 20px 0px;
}
.echarts_title {
background-color: #9E2222;
padding: 5px 10px;
margin-bottom: 0px;
}
.search .list-group .list-group-item{
background: rgb(51, 51, 51);
border: 0px;
border-bottom: 3px solid #1b1b1b;
border-top: 3px solid #1b1b1b;
}
.course_img{
width: 100%;
border-radius: 10px;
}
#search {display: none;}
#search_form{
margin-top: 5px;
margin-bottom: 5px;
}
#search_form .input-group *{
border-radius: 0px;
}
#search_form .input-group input{
background: rgb(51, 51, 51);
border: 1px solid rgb(51, 51, 51);
color: #fff;
}
#search_form .input-group button{
background: #9E2222;
border-color: #9E2222;
color: #fff;
}
</style>
</head>
<body>
<div class="container">
<div class="row">
<div class="col-md-6">
<h3>数据可视化分析<small>(慕课网)</small></h3>
<p>数据来源
<a href="http://www.imooc.com" target="_blank">慕课网</a>,使用python-scrapy爬取数据,解析,预处理,缓存于mysql。
</p>
<p>可视化采用python的flask框架获取统计数据,使用
<a href="http://echarts.baidu.com" target="_blank">Echarts</a>进行简单的可视化。</p>
</div>
<div class="col-md-6">
<ul class="nav nav-pills pull-right" style="margin-top: 50px">
<li role="stat" class="active"><a href="javascript:;">统计</a></li>
<li role="search"><a href="javascript:;">搜索</a></li>
</ul>
</div>
</div>
<div class="search show-line row" id="search">
<div class="col-md-12">
<p class="echarts_title">课程搜索</p>
<form class="row" id="search_form">
<div class="col-lg-12 pull-right">
<div class="input-group">
<input name="keywords" type="text" class="form-control" placeholder="请输入课程关键词">
<span class="input-group-btn">
<button class="btn btn-default" type="submit">Go!</button>
</span>
</div>
</div>
</form>
<ul class="list-group"></ul>
</div>
</div>
<!-- Statistics tab: one container per chart. The inline style previously
     used the misspelled property "with", which browsers ignore. -->
<div class="stat show-line row">
    <div class="echarts_box col-md-5">
        <p class="echarts_title">课程类型课程数汇总</p>
        <div id="type" style="width:100%;height:400px"></div>
    </div>
    <div class="echarts_box col-md-7">
        <p class="echarts_title">学习方向课程数汇总</p>
        <div id="cate" style="width:100%;height:400px"></div>
    </div>
    <div class="echarts_box col-md-12">
        <p class="echarts_title">课程时长统计</p>
        <div id="duration" style="width:100%;height:300px"></div>
    </div>
    <div class="echarts_box col-md-4">
        <p class="echarts_title">课程难度级别</p>
        <div id="difficulty_level" style="width:100%;height:300px"></div>
    </div>
    <div class="echarts_box col-md-8">
        <p class="echarts_title">课程评分统计</p>
        <div id="overall_rating" style="width:100%;height:300px"></div>
    </div>
    <div class="echarts_box col-md-12">
        <p class="echarts_title">学习人数与评分的关系</p>
        <div id="bubble_gradient" style="width:100%;height:300px"></div>
    </div>
    <div class="echarts_box col-md-12">
        <p class="echarts_title">学习人数排行</p>
        <div id="learn_num_cate" style="width:100%;height:500px"></div>
        <div id="learn_num" style="width:100%;height:600px"></div>
    </div>
</div>
<script type="text/javascript">
// Base URL for API calls: the page URL up to (not including) its last '/'.
var baseUrl = location.href;
baseUrl = baseUrl.substring(0, baseUrl.lastIndexOf('/'));

// Tab switching: activate the clicked pill and show the panel whose class
// matches the pill's "role" attribute, hiding the other panels.
$('.nav li a').click(function (event) {
    var pill = $(this).parent('li');
    pill.addClass('active').siblings('li').removeClass('active');
    $("div." + pill.attr('role')).show().siblings('.show-line').hide();
});

// Initial load of every chart and of the (unfiltered) search list.
get_data('/api/type', 'type', show_pie_simple);
get_data('/api/cate', 'cate', show_pie_simple);
get_data('/api/learn_num', 'learn_num', show_bar_tick_align);
get_data('/api/learn_num_cate', 'learn_num_cate', show_pie_simple);
get_data('/api/difficulty_level', 'difficulty_level', show_pie_custom);
get_data('/api/overall_rating', 'overall_rating', show_line_stack);
get_data('/api/duration', 'duration', show_area_simple);
get_data('/api/bubble_gradient', 'bubble_gradient', show_bubble_gradient);
get_data('/api/search', 'search', show_search);

// Run a search with the current content of the keyword box. The keyword is
// URL-encoded so characters such as '&', '#', '+' or '%' reach the server
// intact instead of corrupting the query string.
function run_search() {
    var keywords = $('input[name=keywords]').val();
    get_data('/api/search?keywords=' + encodeURIComponent(keywords), 'search', show_search);
    // Returning false stops the form from doing a full-page submit.
    return false;
}
$("input[name=keywords]").keyup(run_search);
$("#search_form").submit(run_search);
// Render search results into the ".list-group" inside the element with id
// `elementId`, replacing whatever was there.
//
// data:      array of course rows (course_url, image, title, cate, type,
//            learn_num, difficulty_level, overall_rating,
//            evaluation_number, brief).
// elementId: id of the container element.
//
// The rows come from scraped third-party content, so every value is inserted
// through .attr()/.text() rather than concatenated into an HTML string;
// markup inside a title or brief is shown as text, not executed.
function show_search(data, elementId) {
    var items = $.map(data, function (val) {
        // String(...) keeps null/undefined from turning the jQuery setters
        // into getters.
        var courseUrl = String(val.course_url);

        var imageLink = $('<a target="_blank" rel="noopener noreferrer"></a>')
            .attr('href', courseUrl)
            .append($('<img class="course_img" />').attr('src', String(val.image)));

        var titleLink = $('<a target="_blank" rel="noopener noreferrer"></a>')
            .attr('href', courseUrl)
            .text(String(val.title));

        var details = $('<div class="col-md-9"></div>')
            .append($('<h4></h4>').append(titleLink))
            .append($('<p></p>').text(
                '方向:' + val.cate + ' / ' + val.type + ' 学习人数:' + val.learn_num))
            .append($('<p></p>').text(
                '难度:' + val.difficulty_level + ' 评分:' + val.overall_rating +
                ' 评论数:' + val.evaluation_number))
            .append($('<p class="brief"></p>').text(String(val.brief)));

        return $('<div class="list-group-item clearfix"></div>')
            .append($('<div class="col-md-3"></div>').append(imageLink))
            .append(details)
            .get(0);
    });
    $('#' + elementId).find('.list-group').empty().append(items);
}
// Render a rose-type pie chart of `data` into the element with id `elementId`.
//
// data:      array of {name, value} items; drawn in ascending order of value.
// elementId: id of the chart container.
function show_pie_custom(data, elementId) {
    // Sort a copy so the caller's array is left untouched.
    var sorted = data.slice().sort(function (a, b) { return a.value - b.value; });
    // `option` is a local; it was previously assigned without `var` and
    // leaked into the global scope.
    var option = {
        tooltip: {
            trigger: 'item',
            formatter: "{b} : {c} ({d}%)"
        },
        visualMap: {
            show: true,
            min: 80,
            max: 600
        },
        series: [
            {
                type: 'pie',
                radius: '55%',
                center: ['50%', '50%'],
                data: sorted,
                roseType: 'radius',
                animationType: 'scale',
                animationEasing: 'elasticOut',
                animationDelay: function (idx) {
                    return Math.random() * 200;
                }
            }
        ]
    };
    show_echarts(elementId, option);
}
// 指定图表的配置项和数据
// Render a pie chart with a vertical legend into the element with id
// `elementId`.
//
// data:      array of {name, value} items.
// elementId: id of the chart container. For 'learn_num_cate' the legend's
//            type is reset from 'scroll' to ''.
function show_pie_simple(data, elementId) {
    // Locals are declared with `var`; they were previously implicit globals.
    var legendData = data.map(function (item) {
        return item.name;
    });
    var option = {
        tooltip: {
            trigger: 'item',
            formatter: "{b} : {c} ({d}%)"
        },
        legend: {
            type: 'scroll',
            orient: 'vertical',
            left: 10,
            top: 10,
            bottom: 10,
            data: legendData
        },
        series: [
            {
                type: 'pie',
                radius: '70%',
                center: ['55%', '50%'],
                data: data,
                itemStyle: {
                    emphasis: {
                        shadowBlur: 10,
                        shadowOffsetX: 0,
                        shadowColor: 'rgba(0, 0, 0, 0.5)'
                    }
                }
            }
        ]
    };
    if (elementId == 'learn_num_cate') {
        option.legend.type = '';
    }
    show_echarts(elementId, option);
}
// Render a bar chart into the element with id `elementId`.
//
// data:      array of {name, value} items; names become the x-axis
//            categories, values the bar heights.
// elementId: id of the chart container.
//
// The data-zoom slider is shown only when there are more than 50 categories.
function show_bar_tick_align(data, elementId) {
    // Locals are declared with `var`; they were previously implicit globals.
    var xAxisData = [];
    var seriesData = [];
    data.forEach(function (item) {
        xAxisData.push(item.name);
        seriesData.push(item.value);
    });
    var showDataZoom = xAxisData.length > 50;
    var option = {
        color: ['#3398DB'],
        tooltip: {
            trigger: 'axis',
            axisPointer: {
                // Axis indicator; 'shadow' instead of the default line.
                type: 'shadow',
                label: {
                    show: true
                }
            }
        },
        toolbox: {
            show: true,
            feature: {
                mark: { show: true },
                magicType: { show: true, type: ['line', 'bar'] },
                restore: { show: true }
            }
        },
        calculable: true,
        grid: {
            left: '3%',
            right: '4%',
            bottom: '3%',
            containLabel: true
        },
        xAxis: [
            {
                type: 'category',
                data: xAxisData,
                axisTick: {
                    alignWithLabel: true
                },
                axisLabel: {
                    interval: 0, // show every category label
                    rotate: 60   // tilt labels by 60 degrees
                }
            }
        ],
        yAxis: [
            {
                type: 'value'
            }
        ],
        dataZoom: [
            {
                show: showDataZoom,
                start: 92,
                end: 100
            }
        ],
        series: [
            {
                type: 'bar',
                barWidth: '60%',
                data: seriesData
            }
        ]
    };
    show_echarts(elementId, option);
}
// Render a line chart into the element with id `elementId`.
//
// data:      array of {name, value} items, drawn in the given order; names
//            become the x-axis categories.
// elementId: id of the chart container.
function show_line_stack(data, elementId) {
    // Locals are declared with `var`; they were previously implicit globals.
    var xAxisData = [];
    var seriesData = [];
    data.forEach(function (item) {
        xAxisData.push(item.name);
        seriesData.push(item.value);
    });
    var option = {
        tooltip: {
            trigger: 'axis'
        },
        grid: {
            left: '3%',
            right: '4%',
            bottom: '3%',
            containLabel: true
        },
        xAxis: {
            type: 'category',
            boundaryGap: false,
            data: xAxisData
        },
        yAxis: {
            type: 'value'
        },
        series: [
            {
                type: 'line',
                data: seriesData
            }
        ]
    };
    show_echarts(elementId, option);
}
// Render a smoothed area chart of course counts per duration into the
// element with id `elementId`.
//
// data:      array of {name, value} items where name is a duration label
//            such as "1小时30分" or "45分" and value is the course count.
// elementId: id of the chart container.
//
// Items are re-sorted by duration in minutes before plotting.
function show_area_simple(data, elementId) {
    // Convert a duration label to minutes. A label with hours but no
    // minutes part (e.g. "2小时") counts as zero extra minutes instead of
    // producing NaN.
    function to_minutes(label) {
        var parts = label.split('小时');
        if (parts.length > 1) {
            var extra = parseInt(parts[1].replace('分', ''), 10) || 0;
            return parseInt(parts[0] * 60, 10) + extra;
        }
        return parseInt(label.replace('分', ''), 10);
    }

    // All names are locals; previously `name` overwrote window.name and
    // the others leaked as globals.
    var points = data.map(function (item) {
        return {
            name: item.name,
            // Store the count itself, not the whole row object.
            value: item.value,
            min: to_minutes(item.name)
        };
    });
    points.sort(by("min"));

    var xAxisData = [];
    var seriesData = [];
    points.forEach(function (point) {
        xAxisData.push(point.name);
        seriesData.push(point.value);
    });

    var option = {
        tooltip: {
            trigger: 'axis',
            position: function (pt) {
                return [pt[0], '10%'];
            }
        },
        xAxis: {
            type: 'category',
            boundaryGap: false,
            data: xAxisData
        },
        yAxis: {
            type: 'value',
            boundaryGap: [0, '5%']
        },
        series: [
            {
                type: 'line',
                smooth: true,
                symbol: 'none',
                sampling: 'average',
                itemStyle: {
                    normal: {
                        color: 'rgb(255, 70, 131)'
                    }
                },
                areaStyle: {
                    normal: {
                        color: new echarts.graphic.LinearGradient(0, 0, 0, 1, [{
                            offset: 0,
                            color: 'rgb(255, 158, 68)'
                        }, {
                            offset: 1,
                            color: 'rgb(255, 70, 131)'
                        }])
                    }
                },
                data: seriesData
            }
        ]
    };
    show_echarts(elementId, option);
}
// Render a scatter chart of learner count (x) against overall rating (y)
// into the element with id `elementId`.
//
// dataO:     array of rows with learn_num, overall_rating and title.
// elementId: id of the chart container.
//
// Each point is [learn_num, overall_rating, 0, title]; the title is shown
// as the label when a point is emphasized.
function show_bubble_gradient(dataO, elementId) {
    var points = dataO.map(function (row) {
        return [row.learn_num, row.overall_rating, 0, row.title];
    });
    // `option` is a local; it was previously an implicit global.
    var option = {
        xAxis: {
            splitLine: {
                lineStyle: {
                    type: 'dashed'
                }
            }
        },
        yAxis: {
            splitLine: {
                lineStyle: {
                    type: 'dashed'
                }
            },
            scale: true
        },
        series: [{
            data: points,
            type: 'scatter',
            label: {
                emphasis: {
                    show: true,
                    formatter: function (param) {
                        return param.data[3];
                    },
                    position: 'top'
                }
            },
            itemStyle: {
                normal: {
                    shadowBlur: 10,
                    shadowColor: 'rgba(120, 36, 50, 0.5)',
                    shadowOffsetY: 5,
                    color: new echarts.graphic.RadialGradient(0.4, 0.3, 1, [{
                        offset: 0,
                        color: 'rgb(251, 118, 123)'
                    }, {
                        offset: 1,
                        color: 'rgb(204, 46, 72)'
                    }])
                }
            }
        }]
    };
    show_echarts(elementId, option);
}
// Fetch JSON from `baseUrl + url` and, when the response's status is
// 'success', call `callback(json.data, elementId)`. Other responses are
// ignored.
function get_data(url, elementId, callback) {
    var endpoint = baseUrl + url;
    $.getJSON(endpoint, {}, function (json, textStatus) {
        if (json.status != 'success') {
            return;
        }
        callback(json.data, elementId);
    });
}
// Initialise an ECharts instance with the 'dark' theme on the element with
// id `elementId` and apply `option` to it.
function show_echarts(elementId, option) {
    var container = document.getElementById(elementId);
    var chart = echarts.init(container, 'dark');
    chart.setOption(option);
}
// by(name) takes a property name and returns a comparison function for
// sorting an array of objects by that property.
//
// The comparator returns 0 when both property values are strictly equal.
// Values of the same type are ordered with `<`; values of different types
// are ordered by their `typeof` string. It throws the string "error" when
// either argument is not a non-null object.
function by(name) {
    return function (o, p) {
        var bothObjects = typeof o === "object" && typeof p === "object" && o && p;
        if (!bothObjects) {
            throw ("error");
        }
        var left = o[name];
        var right = p[name];
        if (left === right) {
            return 0;
        }
        if (typeof left !== typeof right) {
            return typeof left < typeof right ? -1 : 1;
        }
        return left < right ? -1 : 1;
    };
}
</script>
</div>
</body>
</html>
不多解释了,请看代码。
运行项目
python runserver.py
线上项目请结合uwsgi+nginx部署,这里就不多说啦。
最终效果
数据可视化分析(慕课网) 数据来源 慕课网,使用python-scrapy爬取数据,解析,预处理,缓存于mysql。
可视化采用python的flask框架获取统计数据,使用 Echarts进行简单的可视化。