1.本地数据集上传到数据仓库Hive

1.1命令

sudo mkdir -R /usr/local/bigdatacase/dataset 
//这里会提示你输入当前用户(本教程是hadoop用户名)的密码  
//下面给hadoop用户赋予针对bigdatacase目录的各种操作权限  
cd /usr/local/
sudo chown -R hadoop:hadoop ./bigdatacase  
cd bigdatacase  

//下面就可以解压缩user.zip文件  
cd ~  //表示进入hadoop用户的目录  
cd 下载  
ls  
unzip user.zip -d /usr/local/bigdatacase/dataset  
cd /usr/local/bigdatacase/dataset  
ls  
head -5 raw_user.csv

1.2数据集进行预处理

1.2.1删除首行

cd /usr/local/bigdatacase/dataset  
//下面删除raw_user中的第1行  
//1d表示删除第1行,同理,3d表示删除第3行,nd表示删除第n行
sed -i '1d' raw_user.csv   
//下面删除small_user中的第1行  
sed -i '1d' small_user.csv  
//下面再用head命令去查看文件的前5行记录,就看不到字段名称这一行了  
head -5 raw_user.csv  
head -5 small_user.csv

1.2.2 添加字段

添加随机省份字段

1.3 导入HDFS中

1.4 Hive创建数据库

//这个为小数据集,下面的为大数据集
hive>  create database dblab;
hive>  create database dblab_big;
//创建表并映射入HDFS中的文件
hive>  CREATE EXTERNAL TABLE dblab.bigdata_user(id INT,uid STRING,item_id STRING,behavior_type INT,item_category STRING,visit_date DATE,province STRING) COMMENT 'Welcome to xmu dblab!' ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/bigdatacase/dataset';

到目前为止就完成了数据的预处理,添加省份,创建Hive数据库,上传到HDFS,并且映射到Hive里面

2.Hive数据分析语句

Hive数据分析就是写SQL语句对刚刚映射的表进行查询、筛选、统计等等一系列操作。

2.1 查看表的属性

uid

item_id

behavior_type

item_category

visit_date

province

1

用户ID

商品ID

用户行为

商品分类

产生时间

用户地理位置

uid

item_id

behavior_type

item_category

visit_date

province

1

10001082

285259775

1

4076

2014-12-08

广东

2

10001082

4368907

1

5503

2014-12-08

湖南

3

10001082

285259788

4

4076

2014-12-08

北京

behaviour_type属性1234代表的意思

1

2

3

4

1

浏览

收藏

加购物车

购买

2.2 SQL语句

2.2.1 2014-12-11~12号有多少条购买商品的记录

//首先查询表中的数据
select * from bigdata_user limit 10;

//然后查询10号之后
select * from bigdata_user where visit_date >'2014-12-10' limit 10;

//再添加限制 13号之前
select * from bigdata_user where visit_date >'2014-12-10' and visit_date <'2014-12-13' limit 10;

//购买行为
select * from bigdata_user where visit_date >'2014-12-10' and visit_date <'2014-12-13' and behavior_type='4'  limit 10;

2.2.2 分析每月1-31号购买情况

按购买日期统计用户的购买商品记录数、用户数

//首先搜索出所有信息
select * from bigdata_user limit 10;  
  
//行为日期,搜索出日期  
select * from bigdata_user limit 10;  
select day(visit_date) from bigdata_user limit 10;  
  
//购买行为的记录数、不同用户数  
select count(*) from bigdata_user where behavior_type='4';  
select count(distinct uid) from bigdata_user where behavior_type='4';  
  
//按日期统计记录数、用户数  
select count(*),day(visit_date) from bigdata_user where behavior_type='4' group by day(visit_date) limit 10;  
select count(distinct uid),day(visit_date) from bigdata_user where behavior_type='4' group by day(visit_date) limit 10;  
  
//保存为表格  
create table day_count as select count(*),day(visit_date) from bigdata_user where behavior_type='4' group by day(visit_date);   
create table day_uid as select count(distinct uid),day(visit_date) from bigdata_user where behavior_type='4' group by day(visit_date);

2.2.3 找到一天购买很多商品的用户

查询某一天在该网站的购买 4项以上商品的用户id

//12号+购买行为  
select * from bigdata_user where behavior_type='4'and visit_date='2014-12-12' limit 10;  
  
//按用户编号分组  
select uid from bigdata_user where behavior_type='4'and visit_date='2014-12-12' group by uid limit 10;  
  
//按用户分组统计  
select uid,count(*) from bigdata_user where behavior_type='4'and visit_date='2014-12-12' group by uid limit 10;  
  
//12号,购买,4项以上  
select uid,count(*) from bigdata_user where behavior_type='4'and visit_date='2014-12-12' group by uid having count(*)>4 limit 10;  
select uid,count(*) from bigdata_user where behavior_type='4'and visit_date='2014-12-12' group by uid having count(behavior_type='4')>4 limit 10;

2.2.4 查询商品的购买数量和购买用户的数量,排序购买数量

查询出每个商品的购买次数 和购买人数

//查询所有购买商品集合
select * from bigdata_user where behavior_type='4' limit 10;

//查询每个商品的购买次数
select item_id,count(1) buy_number  from bigdata_user where behavior_type='4' group by item_id  order by buy_number desc  limit 10;


//查询每个商品购买的用户数量
select item_id,count(distinct uid) user_number from bigdata_user where behavior_type='4' group by item_id  order by user_number desc  limit 10;


//查询每个商品购买的总数量和购买的用户数量
select item_id,count(1) buy_number,count(distinct uid) user_number  from bigdata_user where behavior_type='4' group by item_id  order by buy_number desc  limit 10;

2.2.5 查询广东每日的商品购买率(购买量/浏览量)

//广东每日浏览商品数
select visit_date, count(*) browse_number from bigdata_user where province='广东' and behavior_type='1'  group by visit_date order by browse_number desc ;

//广东每日购买览商品数
select  visit_date, count(*) buy_number from bigdata_user where province='广东' and behavior_type='4'  group by visit_date order by buy_number desc ;

//为以上两个创建单独的表出来
//创建表,每日用户浏览数量
create table browse_user as select  visit_date, count(*) browse_number from bigdata_user where province='广东' and behavior_type='1'  group by visit_date order by browse_number desc ;

//创建表,每日用户购买数量
create table buy_user as select  visit_date, count(*) buy_number from bigdata_user where province='广东' and behavior_type='4'  group by visit_date order by buy_number desc ;

//每日浏览购买成功率
//通过左联查询出每日的浏览量和购买量,并且算出每日购买率
select browse_user.visit_date, browse_number,buy_number,buy_number/browse_number buy_rate from browse_user  left join buy_user  on browse_user.visit_date=buy_user.visit_date;

2.2.6查询2019-11-11每个省的浏览数量,购买数量和比例

//查询2019-12-12每个省的浏览数量,购买数量和比例

//a查询2014-12-12每个省的浏览数量
select province,count(1) browse_number from bigdata_user where visit_date='2014-12-12' and behavior_type='1' group by province order by browse_number desc;

//b查询2014-12-12每个省的购买数量
select province,count(1) buy_number from bigdata_user where visit_date='2014-12-12' and behavior_type='4' group by province order by buy_number desc;

//以下语句太长格式为
select a.province,a.browse_number,b.buy_number,两个相除 from (表格a) a 
left join (表格b) b on a.province=b.province;


//一条语句
//直接显示每个省浏览数量、购买数量、购买率
select a.province,a.browse_number,b.buy_number,b.buy_number/a.browse_number rate from  ( select province,count(1) browse_number from bigdata_user where visit_date='2014-12-12' and behavior_type='1' group by province order by browse_number desc) a left join ( select province,count(1) buy_number from bigdata_user where visit_date='2014-12-12' and behavior_type='4' group by province order by buy_number desc ) b on a.province=b.province;

2.2.7用户10001082在2014-12-12号当天活跃度:该用户点击行为占该天所有点击行为的比例

//该天所有点击行为数量
select count(*) users from bigdata_user where visit_date='2014-12-12' and behavior_type='1';

//该用户的点击量
select count(*) user_number from bigdata_user where visit_date='2014-12-12' and behavior_type='1' and uid=10001082;

//结合上面两个语句,显示用户点击量、当天点击量和比例
select '10001082' uid,a.users,b.user_number,b.user_number/a.users rate from ( select count(*) users from bigdata_user where visit_date='2014-12-12' and behavior_type='1' ) a ,( select count(*) user_number  from bigdata_user where visit_date='2014-12-12' and behavior_type='1' and uid=10001082  ) b;

2.2.8查询出用户最喜欢的商品集(收藏)

//查询出用户最喜欢的商品集
select item_id,count(1) collect from bigdata_user where behavior_type='2'  group by item_id order by collect desc limit 20 ;

2.2.9某个地区的用户当天浏览网站的次数

//查询出每个省浏览网站的次数
select province,count(behavior_type) count from bigdata_user where behavior_type='1' group by province;

//把结果存入表格
create table browse_count as select province,count(behavior_type) count from bigdata_user where behavior_type='1' group by province;

//查看结果
select * from browse_count;

2.2.10查询潜力最大的商品(购物车)

//查询加入购物车最多的商品
select item_id,count(behavior_type) cart_number from bigdata_user where behavior_type='3'  group by item_id order by cart_number desc limit 20;

2.2.11查询用户浏览和购买的比例

//查询用户浏览的数量  
select uid,count(1) br from bigdata_user where behavior_type='1' group by uid;  
  
//查询用户购买的数量  
select uid,count(1) bn from bigdata_user where behavior_type='4' group by uid ;  
  
//查询用户浏览和购买的比例,集成在一个语句上  
select a.uid,a.br,b.bn,b.bn/a.br rate from ( select uid,count(1) br from bigdata_user where behavior_type='1' group by uid ) a left join (select uid,count(1) bn from bigdata_user where behavior_type='4' group by uid  ) b on a.uid=b.uid order by rate desc limit 20;  
  
//如果要保存表,前面加上create table user_buy_rate as  ..  
create table user_buy_rate as 加上最后一个语句

3.Hive、MySQL、HBase数据互导

3.1 Hive数据分析结果(如用户购买与浏览比例),从Hive传输到MySQL中

1 在Hive中准备好数据表

hive准备待传输的数据

use dblab;

//创建数据
create table if not exists user_buy_rate  row format delimited fields terminated by '\t' as  select a.uid,a.br,b.bn,b.bn/a.br rate from ( select uid,count(1) br from bigdata_user where behavior_type='1' group by uid ) a left join (select uid,count(1) bn from bigdata_user where behavior_type='4' group by uid  ) b on a.uid=b.uid order by rate desc  ;

2 mysql准备

mysql中
//a显示数据库
show databases;

//b创建数据库dblab
create database if not exists dblab;


//c创建mysql表
create table if not exists `user_buy_rate` (`uid` varchar(100), `br` bigint,`bn` bigint,`rate` double) engine=InnoDB DEFAULT CHARSET =utf8;

//查看表结构
desc user_buy_rate;

3 sqoop进行传输

//sqoop进行数据传输
sqoop export --connect jdbc:mysql://127.0.0.1:3306/dblab?useSSL=false --username root --password 123456 --table user_buy_rate --export-dir /user/hive/warehouse/dblab.db/user_buy_rate --input-fields-terminated-by '\t'

3.2 Hive数据分析结果(如用户购买与浏览比例),从MySQL传输到Hbase中

1 Hbase准备

hbase中

//在Hbase中创建一个user_buy_rate表,历史版本保留数量为5
hbase> create 'user_buy_rate', { NAME => 'f1', VERSIONS => 5}

2 Sqoop传输

//sqoop传输
sqoop  import  --connect jdbc:mysql://localhost:3306/dblab --username root --password 123456 --table user_buy_rate --hbase-table table user_buy_rate --column-family f1 --hbase-row-key uid --hbase-create-table -m 1

3 Hbase查看

//查看HBase中user_action表数据
habse> scan 'user_buy_rate',{LIMIT=>10}  #只查询前面10行