1. Raw data:
- goods.csv: 20,000 rows
款色号,款号,大类,中类,小类,性别,年季,商品年份,商品季节,颜色,上市月份,上市温度,订货评分,运营评分,波段,售价,系列,版型
BAXL3399A04,BAXL3399,内搭,衬衫,长袖正统衬衫,,2016秋,2016,秋,A04,,,,,2016秋三,799,经典商务,合身版
NWLJ4530L01,NWLJ4530,配件,皮具,福袋,,2015秋,2015,秋,L01,,,,,,999,经典商务,无
FTXJ1109J03,FTXJ1109,内搭,T恤,长袖棉T恤,,2015春,2015,春,J03,,,,,2015春二第(1)波,599,时尚商务,合身版
FOFL3210L01B,FOFL3210,外套,单西,茄克单西服,,2016秋,2016,秋,L01B,,,,,2016秋三,1880,时尚商务,合身版
BFWJ4541D02,BFWJ4541,外套,大衣,毛呢大衣,,2015冬,2015,冬,D02,,,,,2015初冬,2280,经典商务,合身版
BCCV3388L01,BCCV3388,内搭,衬衫,长袖休闲衬衫,,2017秋,2017,秋,L01,,,,,2017初秋秋二,699,新商务系列,合身版
FTTL3366J03,FTTL3366,内搭,T恤,短袖天丝T恤,,2016秋,2016,秋,J03,,,,,2016秋一,899,时尚商务,合身版
FTSL3312L01,FTSL3312,内搭,T恤,长袖丝光棉T恤,,2016秋,2016,秋,L01,,,,,2016秋二,1180,时尚商务,合身版
BECV2386J13B,BECV2386,内搭,衬衫,短袖正统衬衫,,2017夏,2017,夏,J13B,,,,,2017夏三,799,经典商务,合身版
FRDJ4368J17,FRDJ4368,外套,羽绒服,短版茄克薄羽绒服,,2015冬,2015,冬,J17,,,,,2015深秋,1380,时尚商务,合身版
BFWJ4525D01B,BFWJ4525,外套,大衣,毛呢大衣,,2015冬,2015,冬,D01B,,,,,2015初冬,3680,经典商务,合身版
FTBV2721N02,FTBV2721,内搭,T恤,短袖丝光棉T恤,,2017夏,2017,夏,N02,,,,,2017夏五,799,时尚商务,合身版
FYXV1302L01,FYXV1302,外套,毛衫,茄克毛衫,,2017春,2017,春,L01,,,,,2017春一,1580,时尚商务,合身版
BAGJ3307J03,BAGJ3307,内搭,衬衫,长袖高档衬衫,,2015秋,2015,秋,J03,,,,,2015初秋,1580,高端系列,合身版
PPXJ1318E01,PPXJ1318,鞋,鞋,鞋子,,2015春,2015,春,E01,,,,,2015春一第(1)波,1490,无,无
FQXV4356L01,FQXV4356,裤装,长裤,休闲裤,,2017冬,2017,冬,L01,,,,,2017初冬冬二,989,时尚系列,合身版
PPXV1336L15,PPXV1336,鞋,鞋,鞋子,,2017春,2017,春,L15,,,,,2017春二,2290,经典商务,无
FCCJ3119L01B,FCCJ3119,内搭,衬衫,长袖休闲衬衫,,2015秋,2015,秋,L01B,,,,,2015初秋,699,时尚商务,合身版
FTCJ2540L01,FTCJ2540,内搭,T恤,短袖棉T恤,,2015夏,2015,夏,L01,,,,,2015夏一第(1)波,869,时尚商务,修身版
FRJJ4564C02,FRJJ4564,外套,羽绒服,中长版茄克羽绒服,,2015冬,2015,冬,C02,,,,,2015初冬,1480,时尚商务,合身版
- stock.csv: 20,000,000 rows
区域,门店代码,销售月份,销售日期,款号,颜色,变动原因,库存变动量,店主,店铺类型
1000,1000W01,201701,20170101,BADJ2134,L01,调拨,-75.0,,
1000,1000W01,201701,20170101,BADJ2134,L12,调拨,-141.0,,
1000,1000W01,201701,20170101,BADJ2134,N11,调拨,-144.0,,
1000,1000W01,201701,20170101,BADJ2370,H02,调拨,-1496.0,,
1000,1000W01,201701,20170101,BADJ2370,H02B,调拨,-464.0,,
1000,1000W01,201701,20170101,BADJ2370,L03,调拨,-1210.0,,
1000,1000W01,201701,20170101,BADJ2370,L03B,调拨,-234.0,,
1000,1000W01,201701,20170101,BADJ2371,I02,调拨,-824.0,,
1000,1000W01,201701,20170101,BADJ2371,L14,调拨,-1510.0,,
1000,1000W01,201701,20170101,BADJ2375,L12,调拨,-91.0,,
1000,1000W01,201701,20170101,BADJ2375,N07,调拨,-133.0,,
1000,1000W01,201701,20170101,BADJ2378,L02,调拨,-1773.0,,
1000,1000W01,201701,20170101,BADJ2378,L12,调拨,-3033.0,,
1000,1000W01,201701,20170101,BADJ2382,L03,调拨,-326.0,,
1000,1000W01,201701,20170101,BADJ2382,L10,调拨,-377.0,,
1000,1000W01,201701,20170101,BADJ2383,K01,调拨,-20.0,,
1000,1000W01,201701,20170101,BADJ2383,L10,调拨,-30.0,,
1000,1000W01,201701,20170101,BADJ2510,A04,调拨,-2756.0,,
1000,1000W01,201701,20170101,BADJ2510,L10,调拨,-6165.0,,
1000,1000W01,201701,20170101,BADJ2511,A04,调拨,-277.0,,
1000,1000W01,201701,20170101,BADJ2511,N06,调拨,-153.0,,
1000,1000W01,201701,20170101,BADJ2521,H03,调拨,-120.0,,
1000,1000W01,201701,20170101,BADJ2537,L05,调拨,-490.0,,
2. Upload the data to S3:
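The files go to the bucket and prefix that the Spark job reads from; a minimal sketch using the AWS CLI (adjust the bucket name to your own):
aws s3 cp goods.csv s3://lifecyclebigdata/test/data/lifecycle/goods.csv
aws s3 cp stock.csv s3://lifecyclebigdata/test/data/lifecycle/stock.csv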
3. Create a Maven project in IDEA with Scala support:
- Create the project: balaTest
- pom.xml:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.lifecycle.demo01</groupId>
<artifactId>balaTest</artifactId>
<version>1.0-SNAPSHOT</version>
<repositories>
<repository>
<id>emr-5.18.0-artifacts</id>
<name>EMR 5.18.0 Releases Repository</name>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
<url>https://s3.us-west-1.amazonaws.com/us-west-1-emr-artifacts/emr-5.18.0/repos/maven/</url>
</repository>
</repositories>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<scala.version>2.11.12</scala.version>
<spark.version>2.4.3</spark.version>
<hadoop.version>2.8.5</hadoop.version>
<spark.pom.scope>compile</spark.pom.scope>
</properties>
<dependencies>
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk</artifactId>
<version>1.11.636</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
<scope>${spark.pom.scope}</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
<scope>${spark.pom.scope}</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
<scope>${spark.pom.scope}</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>${spark.version}</version>
<scope>${spark.pom.scope}</scope>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>2.6.3</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.6.3</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>2.6.3</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-aws</artifactId>
<version>2.8.5</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.8.5</version>
</dependency>
<dependency>
<groupId>net.java.dev.jets3t</groupId>
<artifactId>jets3t</artifactId>
<version>0.9.4</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.4</version>
</dependency>
</dependencies>
<!-- Build plugins: compile the Scala and Java sources together and package them into a jar -->
<build>
<resources>
<resource>
<directory>${env.HOME}/.aws/</directory>
</resource>
<!-- Package the resource files -->
<resource>
<directory>src/main/resources</directory>
<includes>
<include>**/*.properties</include>
<include>**/*.xml</include>
</includes>
<filtering>false</filtering>
</resource>
<!--<resource>
<directory>src/main/java</directory>
<includes>
<include>**/*.properties</include>
<include>**/*.xml</include>
</includes>
<filtering>false</filtering>
</resource>-->
</resources>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>1.2.1</version>
<executions>
<execution>
<goals>
<goal>java</goal>
</goals>
</execution>
</executions>
<configuration>
<mainClass>com.amazonaws.samples.S3Sample</mainClass>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<executions>
<execution>
<id>scala-compile-first</id>
<goals>
<goal>compile</goal>
</goals>
<configuration>
<includes>
<include>**/*.scala</include>
</includes>
</configuration>
</execution>
</executions>
</plugin>
<!-- maven-assembly-plugin: also bundle the dependencies into the jar (currently commented out) -->
<!--<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>2.4.1</version>
<configuration>
<!– get all project dependencies –>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<!– MainClass in manifest makes an executable jar –>
<archive>
<manifest>
<mainClass>util.Microseer</mainClass>
</manifest>
</archive>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<!– bind to the packaging phase –>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>-->
</plugins>
</build>
</project>
3. Scala support:
- Create the folder: scala
- Mark it as a Sources Root
- Success indicator: the folder changes color in IDEA
4. S3 configuration file:
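A minimal sketch of the file, assumed to live at src/main/resources/s3.properties so that getResourceAsStream("s3.properties") in the code below can find it; the three keys are exactly the ones read in getSpark(), and the values are placeholders to replace with your own:
# S3 credentials and endpoint read by getSpark(); placeholder values
fs.s3a.access.key=YOUR_ACCESS_KEY
fs.s3a.secret.key=YOUR_SECRET_KEY
fs.s3a.endpoint=s3.<your-region>.amazonaws.com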
5. Custom partitioner:
package com.lifecycle.sparkUtil

import org.apache.spark.Partitioner

// Takes one argument: an array of keys (here, all the distinct areas).
class StockPartitioner(val arr: Array[String]) extends Partitioner {
  // Notes:
  // - partition ids start at 0
  // - with num partitions, valid ids range from 0 to num-1

  // 1. Number of partitions: one partition per area.
  override def numPartitions: Int = arr.length

  // 2. Map a key to its partition id. Assumes every key appears in arr;
  //    otherwise indexOf returns -1, which is an invalid partition id.
  override def getPartition(key: Any): Int = arr.indexOf(key.toString)
}
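A quick illustration of how the partitioner behaves (hypothetical values, not part of the project code): every record keyed by an area goes to the partition whose id is that area's index in the array passed in.
val p = new StockPartitioner(Array("1000", "2000", "3000"))
p.numPartitions        // 3
p.getPartition("2000") // 1 -> all records of area 2000 land in partition 1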
6. Main code:
package com.lifecycle.demo02

import java.io.InputStream
import java.util.Properties

import com.lifecycle.sparkUtil.StockPartitioner
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.IntegerType

object Demo01_partition {
  def main(args: Array[String]): Unit = {
    // goods("typeColorId","typeId","bigLei","midLei","smallLei","gendar","yearSeason","goodsYear","goodsSeason","color","sellMonth","sellTpt","dingScore","yunYingScore","boDuan","sellPrice","serialId","banType")
    // stock("area","stockId","sellMonth","sellDate","kuanId","color","reasonBian","numBian","owner","stockType")
    val spark: SparkSession = getSpark()
    // 0. Implicit conversions (needed for rdd.toDF below)
    import spark.implicits._

    // I. Join the two tables (goods, stock)
    // 1. Join: (area, small category, style-color id, date converted to week-of-year, stock change)
    val dfJoin: DataFrame = spark.sql("select stock.area,goods.smallLei,goods.typeColorId,weekofyear(to_date(stock.sellDate,'yyyyMMdd')) weekofyear,stock.numBian from lifeCycleGoods goods,lifeCycleStock stock where goods.typeId=stock.kuanId and goods.color=stock.color")
    // 2. Cache the joined table
    val joinTable: DataFrame = dfJoin.cache()
    // 3. Register it: joinTable (area, smallLei, typeColorId, weekofyear, numBian)
    joinTable.createOrReplaceTempView("joinTable")

    // II. Weekly aggregation: dfZhou (area, smallLei, typeColorId, weekofyear, sumstock)
    // 1. Weekly SQL
    val dfZhou: DataFrame = spark.sql("select area,smallLei,typeColorId,weekofyear,sum(numBian) sumstock from joinTable group by area,smallLei,typeColorId,weekofyear order by area,smallLei,typeColorId,weekofyear")
    // 2. Cache the weekly table
    val zhouTable: DataFrame = dfZhou.cache()
    // 3. Register it: (area, smallLei, typeColorId, weekofyear, sumstock)
    zhouTable.createOrReplaceTempView("zhouTable")

    // III. Running weekly total, with the result sorted
    //      (whatever ORDER BY inside OVER sorts on is what the running sum accumulates over)
    // 1. Cumulative-sum SQL
    val sqlLeiJia = "select " +
      "area,smallLei,typeColorId,weekofyear,sumstock," +
      "sum(sumstock) over(partition by area,smallLei,typeColorId order by area,smallLei,typeColorId,weekofyear) sumoverstock " +
      "from zhouTable " +
      "order by area,smallLei,typeColorId,weekofyear"
    // 2. Execute and cache the result (area, smallLei, typeColorId, weekofyear, sumstock, sumoverstock)
    val dfRes: DataFrame = spark.sql(sqlLeiJia).cache()

    // IV. Repartition the result so that each area is written to its own file.
    //     Approach: df -> rdd -> repartition by area -> rdd -> df
    // 1. Collect all distinct areas into an array
    val dfAreaRes: DataFrame = spark.sql("select distinct(area) area from joinTable")
    val arr: Array[String] = dfAreaRes.rdd.map(row => row.get(0).toString).collect()
    // 2. Custom partitioner: one partition per area
    val areaPartitioner: StockPartitioner = new StockPartitioner(arr)
    // 3. df -> rdd (area, smallLei, typeColorId, weekofyear, sumstock, sumoverstock)
    val rddRow: RDD[Row] = dfRes.rdd
    // Key each row by area so it can drive the partitioner
    val rddColumn: RDD[(String, (String, String, String, String, String, String))] = rddRow.map(row => {
      val area: String = row.get(0).toString
      val smallLei: String = row.get(1).toString
      val typeColorId: String = row.get(2).toString
      val weekofyear: String = row.get(3).toString
      val sumstock: String = row.get(4).toString
      val sumoverstock: String = row.get(5).toString
      (area, (area, smallLei, typeColorId, weekofyear, sumstock, sumoverstock))
    })
    // 4. Repartition by area
    val rddPar: RDD[(String, (String, String, String, String, String, String))] = rddColumn.partitionBy(areaPartitioner)
    // 5. rdd -> df
    val dfResRes: DataFrame = rddPar.map(e => e._2).toDF("area", "smallLei", "typeColorId", "weekofyear", "sumstock", "sumoverstock")

    // V. Write the output
    dfResRes.write.option("header", "true").option("delimiter", ",").csv("s3a://lifecyclebigdata/test/data/lifecycle/res02")
    spark.stop()
  }
  // Build the SparkSession and register
  // table goods: lifeCycleGoods (18 columns)
  // table stock: lifeCycleStock (10 columns)
  def getSpark(): SparkSession = {
    // 1. Spark session
    val spark = SparkSession.builder()
      // Event logging (used by the Spark history server); disabled here
      .config("spark.eventLog.enabled", "false")
      // Driver process memory
      .config("spark.driver.memory", "2g")
      // Number of Spark SQL shuffle partitions
      .config("spark.sql.shuffle.partitions", "100")
      .appName("SparkDemoFromS3")
      .getOrCreate()
    // 2. Log level
    spark.sparkContext.setLogLevel("WARN")
    // 3. Read the resource file with the S3 settings
    val properties = new Properties()
    val stream: InputStream = Demo01_partition.getClass.getClassLoader.getResourceAsStream("s3.properties")
    properties.load(stream)
    // 4. Configure the S3 data source
    val sc: SparkContext = spark.sparkContext
    sc.hadoopConfiguration.set("fs.s3a.access.key", properties.getProperty("fs.s3a.access.key"))
    sc.hadoopConfiguration.set("fs.s3a.secret.key", properties.getProperty("fs.s3a.secret.key"))
    sc.hadoopConfiguration.set("fs.s3a.endpoint", properties.getProperty("fs.s3a.endpoint"))
    // 5. Goods table, 18 columns (style-color id, style id, big/mid/small category, gender, year-season,
    //    year, season, color, launch month, launch temperature, order score, operations score, wave,
    //    price, series, fit); note: the file has a header row
    val dfSourceGoods: DataFrame = spark.read.option("header", "true").option("delimiter", ",").csv("s3a://lifecyclebigdata/test/data/lifecycle/goods.csv")
    val dfGoods: DataFrame = dfSourceGoods.toDF("typeColorId", "typeId", "bigLei", "midLei", "smallLei", "gendar", "yearSeason", "goodsYear", "goodsSeason", "color", "sellMonth", "sellTpt", "dingScore", "yunYingScore", "boDuan", "sellPrice", "serialId", "banType")
    dfGoods.cache().createOrReplaceTempView("lifeCycleGoods")
    // 6. Stock table, 10 columns (area, store code, sales month, sales date, style id, color,
    //    change reason, stock change, store owner, store type); note: the file has a header row
    val dfSourceStock: DataFrame = spark.read.option("header", "true").option("delimiter", ",").csv("s3a://lifecyclebigdata/test/data/lifecycle/stock.csv")
    val dfStock: DataFrame = dfSourceStock.toDF("area", "stockId", "sellMonth", "sellDate", "kuanId", "color", "reasonBian", "numBian", "owner", "stockType").withColumn("numBian", col("numBian").cast(IntegerType))
    dfStock.cache().createOrReplaceTempView("lifeCycleStock")
    spark
  }
}

// Case class for the join result (ultimately unused): stock.area, smallLei, typeColorId, weekofyear, numBian
// case class JoinGoods(var area: String, var smallLei: String, var typeColorId: String, var weekofyear: String, var numBian: String)
7. Package:
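A sketch of the step, assuming the standard Maven layout: with this pom the jar is built as target/balaTest-1.0-SNAPSHOT.jar, so rename it or adjust the path to match the jar name that spark-submit uses in step 8.
mvn clean package
aws s3 cp target/balaTest-1.0-SNAPSHOT.jar s3://lifecyclebigdata/test/jar/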
8. Run:
spark-submit --master yarn --deploy-mode client --num-executors 20 --executor-cores 2 --executor-memory 4g --class com.lifecycle.demo02.Demo01_partition s3://lifecyclebigdata/test/jar/demo1007-1.0-SNAPSHOT.jar
9. Result: 36 partition files (one per area)
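A quick way to verify the output, using the path from the write call above:
aws s3 ls s3://lifecyclebigdata/test/data/lifecycle/res02/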