pom.xml file

Add dependencies according to your project's actual needs.

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>spark</artifactId>
    <version>1.0</version>
    <packaging>jar</packaging>

    <properties>
        <java.version>1.8</java.version>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <scala.version>2.11</scala.version>
        <spark.version>2.1.1</spark.version>
        <mysql.version>5.1.48</mysql.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.version}</artifactId>
            <version>${spark.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.version}</artifactId>
            <version>${spark.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_${scala.version}</artifactId>
            <version>${spark.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.version}</version>
            <scope>runtime</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Plugin for compiling Scala sources -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <!-- Maven compiler plugin (optional) -->
            <!--<plugin>-->
            <!--    <groupId>org.apache.maven.plugins</groupId>-->
            <!--    <artifactId>maven-compiler-plugin</artifactId>-->
            <!--    <version>3.8.1</version>-->
            <!--    <configuration>-->
            <!--        <source>1.8</source>-->
            <!--        <target>1.8</target>-->
            <!--    </configuration>-->
            <!--</plugin>-->

            <!-- Bundle dependencies into the jar -->
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <appendAssemblyId>false</appendAssemblyId>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

        </plugins>
    </build>

</project>

Properties file

Configure the MySQL connection parameters.

# MySQL connection parameters
mysql.url=jdbc:mysql://92.168.13.11:3306/zhmlj?useUnicode=true&characterEncoding=utf-8&useSSL=false
mysql.user=root
mysql.password=root
# Large-table read switch (when enabled, the table is read concurrently in partitions)
mysql.isPartition=true
# Table size (an estimate is fine, as long as it is greater than or equal to the actual row count)
mysql.tableCount=11000000
# Rows per partition (the table is read page by page; with these values, 11000000 / 100000 = 110 partitions)
mysql.partitionLimit=100000
# Ordering column
mysql.orderField=id

Properties utility class

Reads the MySQL connection parameters from the properties file.

package utils

import java.io.InputStream
import java.util.Properties

object PropertiesUtils {
  // Lazily initialized singleton Properties object
  lazy val getProperties: Properties = {
    val properties = new Properties()
    val in: InputStream = this.getClass.getClassLoader.getResourceAsStream("application.properties")
    properties.load(in)
    properties
  }

}
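
A quick way to verify the loader is to print a few of the configured values; this is only a throwaway sketch (the object name PropertiesCheck is illustrative; the keys are the ones defined in application.properties above):

package demo

import utils.PropertiesUtils

// Minimal sketch: print a couple of the configured connection parameters
object PropertiesCheck {
  def main(args: Array[String]): Unit = {
    val props = PropertiesUtils.getProperties
    println(props.getProperty("mysql.url"))
    println(props.getProperty("mysql.isPartition"))
  }
}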

SparkUtils utility class

Creates the singleton SparkSession.

package utils

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object SparkUtils {

  /**
   * Create the batch configuration object.
   * setMaster: execution mode. local = single-threaded, local[n] = run with n threads, local[*] = run with as many threads as there are CPU cores
   * setAppName: application name
   * set: other properties (spark.testing.memory allocates 4 GB of memory for testing)
   */

  // TODO Use this SparkConf when running locally
  lazy val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("TestHive").set("spark.testing.memory", "4294967296")

  // TODO Use this SparkConf when packaging for the cluster
  //  lazy val sparkConf: SparkConf = new SparkConf().setAppName("SparkJob")

  // Create the session
  //  lazy val sparkSession: SparkSession = SparkSession.builder().config(sparkConf).getOrCreate()

  // Create the session with Hive support enabled
  lazy val sparkSessionWithHive: SparkSession = SparkSession.builder().enableHiveSupport().config(sparkConf).getOrCreate()

}
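
As a quick sanity check, the session can be obtained and exercised like this; a minimal sketch, assuming the utility classes above are on the classpath (the object name SparkSessionCheck and the query are purely illustrative):

package demo

import org.apache.spark.sql.SparkSession
import utils.SparkUtils

// Minimal sketch: obtain the singleton session and run a trivial query
object SparkSessionCheck {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkUtils.sparkSessionWithHive
    spark.sql("show databases").show()
    spark.stop()
  }
}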

MySQL connection utility class

Creates the MySQL data source for reads.

package utils

import java.sql.{Connection, DriverManager}
import java.util.Properties
import org.apache.spark.sql.DataFrame

object ConnectUtils {

  private val properties: Properties = PropertiesUtils.getProperties

  /**
   * MySQL data source (read): returns a DataFrame for the given table name
   */
  def mysqlSource: String => DataFrame = (tableName: String) => {
    val prop = new Properties()
    prop.setProperty("user", properties.getProperty("mysql.user"))
    prop.setProperty("password", properties.getProperty("mysql.password"))
    prop.setProperty("driver", "com.mysql.jdbc.Driver")

    // Whether the large-table (partitioned) read is enabled
    if (properties.getProperty("mysql.isPartition").equals("true")) {
      // Estimated table row count
      val tableCount: Int = properties.getProperty("mysql.tableCount").toInt
      // Rows per page
      val partitionLimit: Int = properties.getProperty("mysql.partitionLimit").toInt
      // Number of pages needed (one JDBC partition per page)
      val pages: Int = tableCount / partitionLimit
      // Partition predicates (one WHERE fragment per page)
      val partitionArray = new Array[String](pages)
      val orderField: String = properties.getProperty("mysql.orderField")
      for (i <- 0 until pages) {
        //        partitionArray(i) = s"1=1 order by ${properties.getProperty("mysql.orderField")} limit ${i * partitionLimit},${partitionLimit}"
        // Because LIMIT with a large offset performs poorly on very large MySQL tables, the seek-style pagination below is recommended instead
        partitionArray(i) = s"1=1 and ${orderField} >=(select ${orderField} from ${tableName} order by ${orderField} limit ${i * partitionLimit},1) limit ${partitionLimit}"
      }
      SparkUtils.sparkSessionWithHive.read.jdbc(properties.getProperty("mysql.url"), tableName, partitionArray, prop)
    } else {
      SparkUtils.sparkSessionWithHive.read.jdbc(properties.getProperty("mysql.url"), tableName, prop)
    }
  }
  
}
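
For orientation, each element of partitionArray is a WHERE-clause fragment and Spark issues one JDBC query per fragment, so the table is read in parallel page by page. A minimal illustration, assuming the example configuration above (partitionLimit=100000, orderField=id) and the table name used later in this post (the object name MysqlReadCheck is only illustrative):

package demo

import org.apache.spark.sql.DataFrame
import utils.ConnectUtils

// Illustration only: read a table through the partitioned MySQL source
object MysqlReadCheck {
  def main(args: Array[String]): Unit = {
    // With the example configuration, the predicate generated for page i = 2 is:
    //   1=1 and id >=(select id from table_test01 order by id limit 200000,1) limit 100000
    val dataDF: DataFrame = ConnectUtils.mysqlSource("table_test01")
    println(dataDF.count())
  }
}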

MySQL to Hive

Use Spark to read the MySQL table data and write it into Hive.

package demo

import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import utils.{ConnectUtils, SparkUtils}

object MysqlToHive {
  def main(args: Array[String]): Unit = {
    // Create the session
    val spark: SparkSession = SparkUtils.sparkSessionWithHive

    // Connect to MySQL
    // TODO Change the table name
    val dataDF: DataFrame = ConnectUtils.mysqlSource("table_test01")

    // TODO Apply the actual business logic here
    // Option 1: save to Hive through the DataFrame API (the target table must already exist)
    dataDF.write.mode(SaveMode.Append).insertInto("databases_test.table_test01")

    // Option 2: let the API create the table automatically, then insert the data
    //dataDF.write.mode(SaveMode.Append).saveAsTable("test_xsh_0401")
    // Option 3: register a temp view and insert into an existing table with SQL
    //    dataDF.createTempView("qiaoJie")
    //    spark.sql("insert into table ods_xsh_0330 select * from qiaoJie")

    println("OK。。。。。")

    // Release resources
    spark.stop()
  }
}
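
Note that insertInto requires databases_test.table_test01 to already exist with a schema compatible with the MySQL source. A hedged sketch of preparing the target table is shown below; the column definitions are placeholders only and must be replaced with the real schema of your MySQL table:

package demo

import utils.SparkUtils

// Sketch only: pre-create the target database and table before running MysqlToHive.
// The column list is a placeholder and must match the real MySQL schema.
object PrepareHiveTable {
  def main(args: Array[String]): Unit = {
    val spark = SparkUtils.sparkSessionWithHive
    spark.sql("create database if not exists databases_test")
    spark.sql(
      """create table if not exists databases_test.table_test01 (
        |  id bigint,
        |  name string
        |) stored as parquet""".stripMargin)
    spark.stop()
  }
}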

Package and submit to YARN

  1. When debugging locally, first set the table size (tableCount) and rows per partition (partitionLimit) to small values in the properties file so the program can be tested locally.
  2. When packaging for the cluster, remember to switch SparkUtils to the packaging sparkConf, and to uncomment the <scope>provided</scope> lines in pom.xml so that the Spark dependencies already installed on the server are used.
  3. After maven clean package succeeds, submit the jar to YARN.
  4. The command to run the jar is shown below.
    The specific parameters can be adjusted to your server configuration; see the Meituan engineering article "Spark性能优化指南——基础篇" (Spark Performance Tuning Guide: Basics) for guidance.
spark-submit \
  --master yarn-cluster \
  --num-executors 2 \
  --executor-memory 6G \
  --executor-cores 4 \
  --driver-memory 1G \
  --class demo.MysqlToHive \
  spark-1.0.jar