package com.shujia.sql
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
object Demo04DSL {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("Demo04DSL")
      .master("local")
      .config("spark.sql.shuffle.partitions", 2) // defaults to 200; lowering it keeps a shuffle from producing too many partitions, and therefore too many tasks
      .getOrCreate()
    // import the built-in functions and the implicit conversions
    import org.apache.spark.sql.functions._
    import spark.implicits._
    // Commonly used DSL methods
    /**
      * Common SQL keywords:
      * select: projection
      * from tb
      * where: filtering
      * group by: grouping --> max, min, avg, sum, count aggregate functions
      * having: filtering after grouping and aggregation
      * order by: sorting
      * limit: capping the number of rows returned
      * joins:
      * inner join: inner join, join
      * outer joins: left join, right join
      * full outer join: full outer join (rarely used)
      * set operations:
      * union, union all
      * deduplication: distinct
      * common functions:
      * string, numeric, date, window, conditional functions
      */
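    // A rough mapping from the keywords above onto Dataset methods
    // (all of them are exercised below):
    //   select    -> df.select(...)
    //   where     -> df.where(...) / df.filter(...)
    //   group by  -> df.groupBy(...).agg(...)
    //   having    -> .where(...) applied after agg
    //   order by  -> df.orderBy(...) / df.sort(...)
    //   limit     -> df.limit(n)
    //   join      -> df.join(other, ...)
    //   union     -> df.union(other), plus .distinct() for SQL-style union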
    // Build a DataFrame
    val stuDF: DataFrame = spark
      .read
      .format("csv") // plain text files are all read via the csv format
      .option("sep", ",") // always set the sep separator when reading text-like files
      .schema("id String,name String,age Int,gender String,clazz String") // attach a schema to the data (column names and types)
      .load("Spark/data/students.txt") // path of the file to read
    // cache a DataFrame that is used multiple times
    stuDF.cache()
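    // For Datasets/DataFrames, cache() is shorthand for persist(StorageLevel.MEMORY_AND_DISK);
    // another level can be chosen explicitly, e.g. (requires import org.apache.spark.storage.StorageLevel):
    // stuDF.persist(StorageLevel.MEMORY_AND_DISK_SER)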
    // print the structure (schema) of the DataFrame (table)
    // stuDF.printSchema()
    // select
    stuDF.select("id", "name", "age") // string expressions
    // .show()
    stuDF.select($"id", $"name", $"age") // $ references a column ==> column expressions (recommended)
    // .show()
    // where, filter
    // filter for students in the liberal-arts (文科) classes
    stuDF.where("clazz like '文科%'") // string expression
    // .show()
    stuDF.where($"clazz" like "文科%") // column expression
    // .show()
    // filter supports string expressions, column expressions, and a functional style
    // DataFrame is simply an alias for Dataset[Row]
    // compared with RDD, Dataset offers many more convenient APIs for quick data analysis
    // every record in a DataFrame is a Row object, i.e. the data carries a structure
    stuDF.filter(row => row.getAs[String]("clazz").contains("文科"))
    // .show()
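    // for reference, the same filter in the other two styles (equivalent to the call above):
    stuDF.filter("clazz like '文科%'") // string expression
    stuDF.filter($"clazz" like "文科%") // column expression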
    // remember to release the cache when done
    stuDF.unpersist()
    // count the number of students per class
    /**
      * groupBy triggers a shuffle
      * in Spark SQL the default number of post-shuffle partitions is 200, so 200 tasks are launched
      * it can be tuned via spark.sql.shuffle.partitions
      */
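    // besides the builder config above, the setting can also be changed at runtime:
    // spark.conf.set("spark.sql.shuffle.partitions", "2")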
    stuDF
      .groupBy($"clazz")
      .agg(count($"id"))
    // .show()
    stuDF
      .groupBy($"clazz")
      .agg(countDistinct($"id") as "cnt") // the data contains duplicates, so countDistinct is required
    // .show()
    // having: filtering after grouping and aggregation
    // count the students per class and keep only the classes with more than 90 students
    /**
      * Why does where work after an aggregation in the DSL, while SQL requires having?
      * In the Spark SQL DSL the methods are simply applied in the order they are written
      */
    stuDF
      .groupBy($"clazz")
      .agg(countDistinct($"id") as "cnt")
      .where($"cnt" > 90)
    // .show()
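    // for comparison, the equivalent SQL needs having (a sketch, assuming stuDF
    // were registered as a temp view named stu):
    // select clazz, count(distinct id) as cnt from stu group by clazz having cnt > 90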
    // orderBy, limit
    // count the students per class, keep the classes with more than 90 students, sort by count descending
    stuDF
      .groupBy($"clazz")
      .agg(countDistinct($"id") as "cnt")
      .where($"cnt" > 90)
      // in Spark SQL, orderBy and sort are the same; in principle both perform a global sort
      // if a single sorted output file is needed, the partition count can be reduced first
      // .coalesce(1)
      .orderBy($"cnt".desc)
      .limit(3)
    // .show()
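    // a related method: sortWithinPartitions sorts inside each partition only (no extra
    // shuffle), which is cheaper when a global order is not required, e.g.:
    // stuDF.sortWithinPartitions($"clazz")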
    // set operations: union
    val stuSampleDF1: Dataset[Row] = stuDF.sample(0.01, 1)
    val stuSampleDF2: Dataset[Row] = stuDF.sample(0.01, 1)
    // stuSampleDF1.show()
    // stuSampleDF2.show()
    // only two DataFrames with identical schemas can be unioned
    /**
      * In SQL, union deduplicates while union all does not
      * Spark SQL's union is effectively the union of the underlying RDDs, so it does not deduplicate
      * chain a distinct() if deduplication is needed
      */
    stuSampleDF1
      .union(stuSampleDF2)
    // .show()
    // union + distinct reproduces the deduplicating behaviour of SQL's union
    stuSampleDF1
      .union(stuSampleDF2)
      .distinct()
    // .show()
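    // note: union matches columns by position; if the two DataFrames have the same
    // columns in a different order, unionByName (Spark 2.3+) matches them by name, e.g.:
    // stuSampleDF1.unionByName(stuSampleDF2)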
    // joins
    val scoDF: DataFrame = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("id String,sub_id String,score Int")
      .load("Spark/data/score.txt")
    // inner join
    stuDF
      // when the join key has the same name in both DFs, a column expression is ambiguous
      .join(scoDF, "id")
    // .show()
    // left join, right join
    // joinType can be one of:
    // inner, cross, outer, full, full_outer, left, left_outer, right, right_outer, left_semi, left_anti
    // note: when the join columns share the same name and a join type must be specified,
    // the join columns have to be wrapped in a List and passed as a parameter
    stuDF
      .join(scoDF, List("id"), "left")
    // .show()
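    // left_semi and left_anti act as filters rather than joins that add columns;
    // e.g. a sketch that keeps only the students without any score record:
    // stuDF.join(scoDF, List("id"), "left_anti")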
    // rename the id column of scoDF to sid
    val newScoDF: DataFrame = scoDF.withColumnRenamed("id", "sid")
    newScoDF
      .join(stuDF, $"id" === $"sid", "right")
    // .show()
    // case when in the DSL
    // compute each student's average score, accounting for the different full marks of each subject (normalisation)
    // average score below 60       -> 不及格 (fail)
    // average score of at least 60 -> 及格 (pass)
    // average score of at least 70 -> 中等 (average)
    // average score of at least 80 -> 良好 (good)
    // average score of at least 90 -> 优秀 (excellent)
    // read the subject table and build a DataFrame
    val subDF: DataFrame = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("sub_id String,sub_name String,sub_score String")
      .load("Spark/data/subject.txt")
    // use the subject table to rescale every score to a 100-point basis
    scoDF
      .join(subDF, "sub_id")
      .select($"id", $"sub_id", round($"score" * 100.0 / $"sub_score", 2) as "new_score")
      .groupBy($"id")
      // compute the average score
      .agg(round(avg($"new_score"), 2) as "avg_score")
      // case when via when/otherwise
      .select($"id", $"avg_score",
        when($"avg_score" >= 90, "优秀")
          .when($"avg_score" >= 80, "良好")
          .when($"avg_score" >= 70, "中等")
          .when($"avg_score" >= 60, "及格")
          .otherwise("不及格") as "等级"
      )
      .sort($"avg_score".desc)
    // .show()
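    // the same branching can also be written as one SQL string via functions.expr,
    // e.g. (a sketch of an equivalent select):
    // .select($"id", $"avg_score", expr(
    //   "case when avg_score >= 90 then '优秀' when avg_score >= 80 then '良好' " +
    //   "when avg_score >= 70 then '中等' when avg_score >= 60 then '及格' " +
    //   "else '不及格' end") as "等级")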
    /**
      * Using the dianxin data, compute:
      * the top three counties by number of visitors within each city
      */
    val dianXinDF: DataFrame = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("mdn String,grid_id Bigint,city_id Int,county_id Int,duration Int,start_time String,end_time String,pt String")
      .load("Spark/data/dianxin.csv")
    dianXinDF.printSchema()
    dianXinDF.show()
    // the SQL way
    dianXinDF.createOrReplaceTempView("dianxin")
    spark.sql(
      """
        |select tt1.city_id
        |      ,tt1.county_id
        |      ,tt1.cnt
        |      ,tt1.rn
        |from (
        |    select t1.city_id
        |          ,t1.county_id
        |          ,t1.cnt
        |          ,row_number() over(partition by t1.city_id order by t1.cnt desc) as rn
        |    from (
        |        select city_id
        |              ,county_id
        |              ,count(distinct mdn) as cnt
        |        from dianxin
        |        group by city_id,county_id
        |    ) t1
        |) tt1 where tt1.rn <= 3
        |""".stripMargin).show(50)
    // the DSL way
    dianXinDF
      .groupBy($"city_id", $"county_id")
      .agg(countDistinct($"mdn") as "cnt")
      .select($"city_id", $"county_id", $"cnt",
        row_number() over Window.partitionBy($"city_id").orderBy($"cnt".desc) as "rn")
      .where($"rn" <= 3)
      .show(50)
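    // equivalently, the window column can be appended with withColumn instead of
    // re-listing every column in the select, e.g. (a sketch):
    // dianXinDF
    //   .groupBy($"city_id", $"county_id")
    //   .agg(countDistinct($"mdn") as "cnt")
    //   .withColumn("rn", row_number() over Window.partitionBy($"city_id").orderBy($"cnt".desc))
    //   .where($"rn" <= 3)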
  }
}
root
|-- mdn: string (nullable = true)
|-- grid_id: long (nullable = true)
|-- city_id: integer (nullable = true)
|-- county_id: integer (nullable = true)
|-- duration: integer (nullable = true)
|-- start_time: string (nullable = true)
|-- end_time: string (nullable = true)
|-- pt: string (nullable = true)
+--------------------+---------------+-------+---------+--------+--------------+--------------+--------+
| mdn| grid_id|city_id|county_id|duration| start_time| end_time| pt|
+--------------------+---------------+-------+---------+--------+--------------+--------------+--------+
|D55433A437AEC8D8D...|117210031795040| 83401| 8340104| 301|20180503190539|20180503233517|20180503|
|D55433A437AEC8D8D...|117205031830040| 83401| 8340104| 510|20180503085547|20180503172154|20180503|
|D55433A437AEC8D8D...|117210031800040| 83401| 8340104| 37|20180503180350|20180503180350|20180503|
|D55433A437AEC8D8D...|117210031820040| 83401| 8340104| 10|20180503173254|20180503173254|20180503|
|47BE1E866CFC071DB...|117135031850040| 83401| 8340104| 11|20180503224834|20180503224834|20180503|
|47BE1E866CFC071DB...|119560032075040| 83211| 8321112| 0|20180503204816|20180503204816|20180503|
|47BE1E866CFC071DB...|119560032075040| 83211| 8321112| 1|20180503104337|20180503104337|20180503|
|47BE1E866CFC071DB...|119805031860040| 83204| 8320412| 1|20180503203340|20180503203400|20180503|
|47BE1E866CFC071DB...|118850031995040| 83201| 8320104| 0|20180503100209|20180503100209|20180503|
|47BE1E866CFC071DB...|121455031245040| 83101| 8310106| 13|20180503185355|20180503185355|20180503|
|D55433A437AEC8D8D...|117210031795040| 83401| 8340104| 301|20180503190539|20180503233517|20180503|
|D55433A437AEC8D8D...|117205031830040| 83401| 8340104| 510|20180503085547|20180503172154|20180503|
|D55433A437AEC8D8D...|117210031800040| 83401| 8340104| 37|20180503180350|20180503180350|20180503|
|D55433A437AEC8D8D...|117210031820040| 83401| 8340104| 10|20180503173254|20180503173254|20180503|
|47BE1E866CFC071DB...|117135031850040| 83401| 8340104| 11|20180503224834|20180503224834|20180503|
|47BE1E866CFC071DB...|119560032075040| 83211| 8321112| 0|20180503204816|20180503204816|20180503|
|47BE1E866CFC071DB...|119560032075040| 83211| 8321112| 1|20180503104337|20180503104337|20180503|
|47BE1E866CFC071DB...|119805031860040| 83204| 8320412| 1|20180503203340|20180503203400|20180503|
|47BE1E866CFC071DB...|118850031995040| 83201| 8320104| 0|20180503100209|20180503100209|20180503|
|47BE1E866CFC071DB...|121455031245040| 83101| 8310106| 13|20180503185355|20180503185355|20180503|
+--------------------+---------------+-------+---------+--------+--------------+--------------+--------+
only showing top 20 rows
+-------+---------+---+---+
|city_id|county_id|cnt| rn|
+-------+---------+---+---+
| null| null| 0| 1|
| 81101| 8110105| 2| 1|
| 81101| 8110106| 1| 2|
| 81101| 8110102| 1| 3|
| 81304| 8130426| 1| 1|
| 83201| 8320115| 14| 1|
| 83201| 8320111| 13| 2|
| 83201| 8320104| 9| 3|
| 83202| 8320211| 4| 1|
| 83202| 8320203| 3| 2|
| 83202| 8320206| 2| 3|
| 83210| 8321023| 1| 1|
| 83211| 8321183| 9| 1|
| 83211| 8321112| 5| 2|
| 83211| 8321181| 2| 3|
| 83305| 8330502| 2| 1|
| 83305| 8330521| 2| 2|
| 83305| 8330522| 2| 3|
| 83404| 8340403| 9| 1|
| 83404| 8340404| 7| 2|
| 83404| 8340421| 6| 3|
| 83407| 8340702| 2| 1|
| 83407| 8340711| 2| 2|
| 83407| 8340721| 1| 3|
| 83408| 8340811| 4| 1|
| 83408| 8340826| 3| 2|
| 83408| 8340823| 3| 3|
| 83410| 8341003| 2| 1|
| 83410| 8341021| 1| 2|
| 83410| 8341023| 1| 3|
| 83413| 8341302| 7| 1|
| 83413| 8341321| 3| 2|
| 83413| 8341322| 2| 3|
| 83416| 8341602| 2| 1|
| 83416| 8341623| 2| 2|
| 83416| 8341621| 1| 3|
| 83501| 8350124| 1| 1|
| 83501| 8350121| 1| 2|
| 83501| 8350111| 1| 3|
| 83503| 8350303| 1| 1|
| 83505| 8350582| 2| 1|
| 83507| 8350782| 1| 1|
| 83507| 8350781| 1| 2|
| 83507| 8350784| 1| 3|
| 83509| 8350922| 1| 1|
| 83611| 8361124| 1| 1|
| 83701| 8370113| 1| 1|
| 83701| 8370104| 1| 2|
| 83701| 8370181| 1| 3|
| 83703| 8370303| 2| 1|
+-------+---------+---+---+
only showing top 50 rows
+-------+---------+---+---+
|city_id|county_id|cnt| rn|
+-------+---------+---+---+
| null| null| 0| 1|
| 81101| 8110105| 2| 1|
| 81101| 8110106| 1| 2|
| 81101| 8110102| 1| 3|
| 81304| 8130426| 1| 1|
| 83201| 8320115| 14| 1|
| 83201| 8320111| 13| 2|
| 83201| 8320104| 9| 3|
| 83202| 8320211| 4| 1|
| 83202| 8320203| 3| 2|
| 83202| 8320206| 2| 3|
| 83210| 8321023| 1| 1|
| 83211| 8321183| 9| 1|
| 83211| 8321112| 5| 2|
| 83211| 8321181| 2| 3|
| 83305| 8330502| 2| 1|
| 83305| 8330521| 2| 2|
| 83305| 8330522| 2| 3|
| 83404| 8340403| 9| 1|
| 83404| 8340404| 7| 2|
| 83404| 8340421| 6| 3|
| 83407| 8340702| 2| 1|
| 83407| 8340711| 2| 2|
| 83407| 8340721| 1| 3|
| 83408| 8340811| 4| 1|
| 83408| 8340826| 3| 2|
| 83408| 8340823| 3| 3|
| 83410| 8341003| 2| 1|
| 83410| 8341021| 1| 2|
| 83410| 8341023| 1| 3|
| 83413| 8341302| 7| 1|
| 83413| 8341321| 3| 2|
| 83413| 8341322| 2| 3|
| 83416| 8341602| 2| 1|
| 83416| 8341623| 2| 2|
| 83416| 8341621| 1| 3|
| 83501| 8350124| 1| 1|
| 83501| 8350121| 1| 2|
| 83501| 8350111| 1| 3|
| 83503| 8350303| 1| 1|
| 83505| 8350582| 2| 1|
| 83507| 8350782| 1| 1|
| 83507| 8350781| 1| 2|
| 83507| 8350784| 1| 3|
| 83509| 8350922| 1| 1|
| 83611| 8361124| 1| 1|
| 83701| 8370113| 1| 1|
| 83701| 8370104| 1| 2|
| 83701| 8370181| 1| 3|
| 83703| 8370303| 2| 1|
+-------+---------+---+---+
only showing top 50 rows
[root@master ~]# hdfs dfs -mkdir /data/spark
[root@master ~]# cd ..
[root@master /]# cd /usr/local/soft
[root@master soft]# ls
0?? hive-1.2.1 redis show
A?? jars redis-6.2.6 spark-2.4.5
data jdk1.8.0_171 scripts sqoop-1.4.7
hadoop-2.7.6 packages shell test.txt
hbase-1.4.6 phoenix-4.15.0 shell01 zookeeper-3.4.6
[root@master soft]# cd data/
[root@master data]# ls
data_skew.txt score.sql testArray
deal_tb.txt scoreStruct testArray2.txt
dept.txt score.txt testLieToLine.txt
DIANXIN.csv students_dt.txt theZenOfPython.txt
DIANXIN.sql student.sql theZen.txt
emp.txt students.txt udtfData.txt
new_db.sql students_year_month.txt wordcount
new_score.txt subject.txt words.txt
scoreMap ${system:java.io.tmpdir} yiqing.csv
[root@master data]# cat words.txt
java,c,c++,sql,scale,python
mysql,springboot,redis
hadoop,hive,hbase,spark,flink
kafka,sqoop,flume,datax,kettle,flink
phoenix
mysql,springboot,redis
hadoop,hive,hbase,spark,flink
kafka,sqoop,flume,datax,kettle,flink
phoenix
mysql,springboot,redis
hadoop,hive,hbase,spark,flink
kafka,sqoop,flume,datax,kettle,flink
phoenix
mysql,springboot,redis
hadoop,hive,hbase,spark,flink
kafka,sqoop,flume,datax,kettle,flink
phoenix
[root@master data]# hdfs dfs -mkdir -p /data/spark/wc/input
[root@master data]# hdfs dfs -put words.txt /data/spark/wc/input
[root@master jars]# ls
hadoop-1.0-SNAPSHOT.jar
HBase-1.0-SNAPSHOT.jar
HBase-1.0-SNAPSHOT-jar-with-dependencies.jar
HIve-1.0-SNAPSHOT.jar
HiveUDF-1.0.jar
Spark-1.0-SNAPSHOT.jar
zookeeper.out
[root@master jars]# rm -r Spark-1.0-SNAPSHOT.jar
rm: remove regular file "Spark-1.0-SNAPSHOT.jar"? y
[root@master jars]# rz -E
rz waiting to receive.
[root@master jars]# spark-submit --class com.shujia.sql.Demo05Submit --master yarn --deploy-mode client --executor-memory 512M --total-executor-cores 1 Spark-1.0-SNAPSHOT.jar
+----------+---+
| word|cnt|
+----------+---+
| c| 1|
| c++| 1|
| sql| 1|
| scale| 1|
|springboot| 4|
| redis| 4|
| sqoop| 4|
| flume| 4|
| java| 1|
| python| 1|
| mysql| 4|
| hadoop| 4|
| hive| 4|
| hbase| 4|
| spark| 4|
| flink| 8|
| kafka| 4|
| datax| 4|
| kettle| 4|
| phoenix| 4|
+----------+---+
[root@master jars]# spark-shell --master yarn-client
scala> import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
scala> val spark: SparkSession =SparkSession.builder().appName("Demo05Submit").config("spark.sql.shuffle.partitions", 2).getOrCreate()
22/05/17 21:23:31 WARN sql.SparkSession$Builder: Using an existing SparkSession; some configuration may not take effect.
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@20d1737
scala> val lineDF: DataFrame = spark.read.format("csv").option("sep", "|").schema("line String").load("/data/spark/wc/input")
lineDF: org.apache.spark.sql.DataFrame = [line: string]
scala> lineDF.show()
+--------------------+
| line|
+--------------------+
|java,c,c++,sql,sc...|
|mysql,springboot,...|
|hadoop,hive,hbase...|
|kafka,sqoop,flume...|
| phoenix|
|mysql,springboot,...|
|hadoop,hive,hbase...|
|kafka,sqoop,flume...|
| phoenix|
|mysql,springboot,...|
|hadoop,hive,hbase...|
|kafka,sqoop,flume...|
| phoenix|
|mysql,springboot,...|
|hadoop,hive,hbase...|
|kafka,sqoop,flume...|
| phoenix|
+--------------------+
scala> lineDF.createOrReplaceTempView("word_count")
scala> spark.sql("select t1.word,count(*) as cnt from(select explode(split(line, ',')) as word from word_count )t1 group by t1.word").show()
22/05/17 21:31:41 WARN metastore.ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 1.2.0
22/05/17 21:31:41 WARN metastore.ObjectStore: Failed to get database default, returning NoSuchObjectException
22/05/17 21:31:41 WARN metastore.ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
+----------+---+
| word|cnt|
+----------+---+
| c| 1|
| c++| 1|
| sql| 1|
| scale| 1|
|springboot| 4|
| redis| 4|
| sqoop| 4|
| flume| 4|
| java| 1|
| python| 1|
| mysql| 4|
| hadoop| 4|
| hive| 4|
| hbase| 4|
| spark| 4|
| flink| 8|
| kafka| 4|
| datax| 4|
| kettle| 4|
| phoenix| 4|
+----------+---+
[root@master jars]# spark-sql --master yarn-client
[root@master jars]# ls
derby.log
hadoop-1.0-SNAPSHOT.jar
HBase-1.0-SNAPSHOT.jar
HBase-1.0-SNAPSHOT-jar-with-dependencies.jar
HIve-1.0-SNAPSHOT.jar
HiveUDF-1.0.jar
metastore_db
Spark-1.0-SNAPSHOT.jar
spark-warehouse
zookeeper.out
[root@master jars]# rm -rf metastore_db/
[root@master jars]# ls
derby.log
hadoop-1.0-SNAPSHOT.jar
HBase-1.0-SNAPSHOT.jar
HBase-1.0-SNAPSHOT-jar-with-dependencies.jar
HIve-1.0-SNAPSHOT.jar
HiveUDF-1.0.jar
Spark-1.0-SNAPSHOT.jar
spark-warehouse
zookeeper.out
[root@master jars]# cd ..
[root@master soft]# cd hive-1.2.1/
[root@master hive-1.2.1]# ls
bin examples lib NOTICE RELEASE_NOTES.txt tmp
conf hcatalog LICENSE README.txt scripts
[root@master hive-1.2.1]# pwd
/usr/local/soft/hive-1.2.1
[root@master hive-1.2.1]# cd conf/
[root@master conf]# ls
beeline-log4j.properties.template
hive-default.xml.template
hive-env.sh
hive-env.sh.template
hive-exec-log4j.properties.template
hive-log4j.properties.template
hive-site.xml
ivysettings.xml
[root@master conf]# vim hive-site.xml
<property>
<name>hive.metastore.uris</name>
<value>thrift://master:9083</value>
</property>
[root@master jars]# hive
Logging initialized using configuration in jar:file:/usr/local/soft/hive-1.2.1/lib/hive-common-1.2.1.jar!/hive-log4j.properties
hive> show databases;
OK
default
stu
test1
testsqoop
Time taken: 0.561 seconds, Fetched: 4 row(s)
hive> use stu;
OK
Time taken: 0.015 seconds
hive> show tables;
OK
new_stu
student
Time taken: 0.02 seconds, Fetched: 2 row(s)
hive> select * from student limit 10;
OK
1500100006 边昂雄 21 男 理科二班
1500100009 沈德昌 21 男 理科一班
1500100011 宰运华 21 男 理科三班
1500100012 梁易槐 21 女 理科一班
1500100015 宦怀绿 21 女 理科一班
1500100017 高芷天 21 女 理科五班
1500100018 骆怜雪 21 女 文科六班
1500100026 向鹏池 21 男 理科四班
1500100027 路辰锟 21 男 文科四班
1500100029 滕旭炎 21 男 理科二班
Time taken: 0.272 seconds, Fetched: 10 row(s)
[root@master conf]# cp /usr/local/soft/hive-1.2.1/conf/hive-site.xml /usr/local/soft/spark-2.4.5/conf/
[root@master conf]# pwd
/usr/local/soft/hive-1.2.1/conf
[root@master conf]# cd ..
[root@master hive-1.2.1]# cd ..
[root@master soft]# cd spark-2.4.5/
[root@master spark-2.4.5]# cd conf/
[root@master conf]# ls
docker.properties.template slaves.template
fairscheduler.xml.template spark-defaults.conf
hive-site.xml spark-defaults.conf.template
log4j.properties.template spark-env.sh
metrics.properties.template spark-env.sh.template
slaves
[root@master conf]# cd ..
[root@master spark-2.4.5]# cd ..
[root@master soft]# cd hive-1.2.1/
[root@master hive-1.2.1]# ls
bin examples lib NOTICE RELEASE_NOTES.txt
conf hcatalog LICENSE README.txt scripts
[root@master hive-1.2.1]# cd lib/
[root@master lib]# cp /usr/local/soft/hive-1.2.1/lib/mysql-connector-java-5.1.49.jar /usr/local/soft/spark-2.4.5/jars/
[root@master lib]# pwd
/usr/local/soft/hive-1.2.1/lib
[root@master lib]# cd ..
[root@master hive-1.2.1]# cd ..
[root@master soft]# cd spark-2.4.5/
[root@master spark-2.4.5]# cd conf/
[root@master spark-2.4.5]# spark-sql --master yarn-client --conf spark.sql.shuffle.partitions=2
spark-sql> show databases;
22/05/18 10:01:20 INFO codegen.CodeGenerator: Code generated in 138.743596 ms
default
stu
test1
testsqoop
Time taken: 0.625 seconds, Fetched 4 row(s)
22/05/18 10:01:20 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.625 seconds, Fetched 4 row(s)
spark-sql> use stu;
Time taken: 0.035 seconds
22/05/18 10:03:06 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.035 seconds
spark-sql> show tables;
22/05/18 10:03:12 INFO codegen.CodeGenerator: Code generated in 7.876977 ms
stu new_stu false
stu student false
Time taken: 0.065 seconds, Fetched 2 row(s)
22/05/18 10:03:12 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.065 seconds, Fetched 2 row(s)
spark-sql> select * from student limit 10;
1500100006 边昂雄 21 男 理科二班
1500100009 沈德昌 21 男 理科一班
1500100011 宰运华 21 男 理科三班
1500100012 梁易槐 21 女 理科一班
1500100015 宦怀绿 21 女 理科一班
1500100017 高芷天 21 女 理科五班
1500100018 骆怜雪 21 女 文科六班
1500100026 向鹏池 21 男 理科四班
1500100027 路辰锟 21 男 文科四班
1500100029 滕旭炎 21 男 理科二班
Time taken: 2.89 seconds, Fetched 10 row(s)
22/05/18 10:04:27 INFO thriftserver.SparkSQLCLIDriver: Time taken: 2.89 seconds, Fetched 10 row(s)
spark-sql> select clazz,count(*) from student group by clazz limit 10;
理科六班 92
理科五班 70
文科二班 87
文科四班 81
理科二班 79
理科三班 68
理科一班 78
文科六班 104
理科四班 91
文科五班 84
Time taken: 4.572 seconds, Fetched 10 row(s)
22/05/18 10:07:28 INFO thriftserver.SparkSQLCLIDriver: Time taken: 4.572 seconds, Fetched 10 row(s)
spark-sql> create database stu_spark;
Time taken: 0.593 seconds
22/05/18 10:11:11 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.593 seconds
spark-sql> show databases;
default
stu
stu_spark
test1
testsqoop
Time taken: 0.016 seconds, Fetched 5 row(s)
22/05/18 10:11:36 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.016 seconds, Fetched 5 row(s)
spark-sql> use stu_spark;
Time taken: 0.031 seconds
22/05/18 10:11:58 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.031 seconds
spark-sql> create table student(
> id string,
> name string,
> age int,
> gender string,
> clazz string
> )
> ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
> STORED AS textfile
> location '/data/spark/stu/input/';
Time taken: 0.202 seconds
22/05/18 10:12:32 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.202 seconds
spark-sql> create external table score(
> student_id string,
> cource_id string,
> sco int
> )
> ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
> STORED AS textfile
> location '/data/spark/score/input/';
Time taken: 0.073 seconds
22/05/18 10:12:56 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.073 seconds
spark-sql> show tables;
stu_spark score false
stu_spark student false
Time taken: 0.028 seconds, Fetched 2 row(s)
22/05/18 10:14:21 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.028 seconds, Fetched 2 row(s)
spark-sql> load data local inpath "/usr/local/soft/data/students.txt" into table student;
22/05/18 10:16:20 ERROR hdfs.KeyProviderCache: Could not find uri with key [dfs.encryption.key.provider.uri] to create a keyProvider !!
22/05/18 10:16:20 INFO metadata.Hive: Renaming src: file:/usr/local/soft/data/students.txt, dest: hdfs://master:9000/data/spark/stu/input/students.txt, Status:true
Time taken: 0.63 seconds
22/05/18 10:16:20 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.63 seconds
spark-sql> select * from student limit 10;
001 zs 20 男 文科一班
1500100001 施笑槐 22 女 文科六班
1500100002 吕金鹏 24 男 文科六班
1500100003 单乐蕊 22 女 理科六班
1500100004 葛德曜 24 男 理科三班
1500100005 宣谷芹 22 女 理科五班
1500100006 边昂雄 21 男 理科二班
1500100007 尚孤风 23 女 文科六班
1500100008 符半双 22 女 理科六班
1500100009 沈德昌 21 男 理科一班
Time taken: 0.244 seconds, Fetched 10 row(s)
22/05/18 10:16:43 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.244 seconds, Fetched 10 row(s)
spark-sql> load data local inpath "/usr/local/soft/data/score.txt" into table score;
22/05/18 10:17:38 INFO metadata.Hive: Renaming src: file:/usr/local/soft/data/score.txt, dest: hdfs://master:9000/data/spark/score/input/score.txt, Status:true
Time taken: 0.182 seconds
22/05/18 10:17:38 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.182 seconds
spark-sql> select * from score limit 10;
1500100001 1000001 98
1500100001 1000002 5
1500100001 1000003 137
1500100001 1000004 29
1500100001 1000005 85
1500100001 1000006 52
1500100002 1000001 139
1500100002 1000002 102
1500100002 1000003 44
1500100002 1000004 18
Time taken: 0.21 seconds, Fetched 10 row(s)
22/05/18 10:17:47 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.21 seconds, Fetched 10 row(s)
spark-sql> select tt1.id
> ,tt1.name
> ,tt1.clazz
> ,tt1.sum_score
> ,tt1.rn
> from (
> select t1.id
> ,t1.name
> ,t1.clazz
> ,t2.sum_score
> ,row_number() over(partition by t1.clazz order by t2.sum_score desc) as rn
> from student t1
> join (
> select student_id as id
> ,sum(sco) as sum_score
> from score
> group by student_id
> ) t2 on t1.id = t2.id
> ) tt1 where tt1.rn <= 3;
1500100418 蓟海昌 文科二班 611 1
1500100823 宓新曦 文科二班 547 2
1500100954 咸芷天 文科二班 533 3
1500100873 路鸿志 文科四班 612 1
1500100258 湛昌勋 文科四班 604 2
1500100116 文元蝶 文科四班 520 3
1500100929 满慕易 理科三班 630 1
1500100184 夔寻巧 理科三班 580 2
1500100598 宰金鹏 理科三班 551 3
1500100834 谷念薇 理科二班 586 1
1500100104 咸冰蝶 理科二班 533 2
1500100762 聂德明 理科二班 524 3
1500100080 巫景彰 理科五班 628 1
1500100547 廖向南 理科五班 584 2
1500100839 明雁桃 理科五班 557 3
1500100596 田晨潍 理科六班 587 1
1500100903 於依云 理科六班 549 2
1500100563 禄昆鹏 理科六班 536 3
1500100308 黄初夏 文科一班 628 1
1500100875 马向南 文科一班 595 2
1500100943 许昌黎 文科一班 580 3
1500100160 云冰真 文科三班 568 1
1500100434 黎雨珍 文科三班 550 2
1500100572 臧忆香 文科三班 550 3
1500100930 闻运凯 文科五班 589 1
1500100949 颜沛槐 文科五班 564 2
1500100904 阎元蝶 文科五班 562 3
1500100136 黎昆鹏 文科六班 583 1
1500100900 查思菱 文科六班 562 2
1500100716 丰冷霜 文科六班 560 3
1500100235 沈香巧 理科一班 520 1
1500100773 傅元蝶 理科一班 492 2
1500100925 卞乐萱 理科一班 486 3
1500100635 蓬怀绿 理科四班 534 1
1500100590 逄中震 理科四班 530 2
1500100939 耿智杰 理科四班 530 3
Time taken: 2.978 seconds, Fetched 36 row(s)
22/05/18 10:23:54 INFO thriftserver.SparkSQLCLIDriver: Time taken: 2.978 seconds, Fetched 36 row(s)