Hive query results are stored on HDFS as text files whose lines are SQL statements. We can read these files with Spark and import them into MySQL. The implementation below shows how Spark reads from HDFS, obtains a database connection inside each partition, and executes the statements against MySQL.




Environment for this project:

JDK: 1.7
Hadoop: 2.7.1
Spark: 1.6.0
Scala: 2.10.5



The project was created with SBT; for details, see: http://blog.csdn.net/shenfuli/article/details/51534734
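
For reference, a minimal build.sbt for this environment might look like the sketch below; the mysql-connector-java version is an assumption, and any 5.1.x release should work:

name := "RDDtoMySQL"

version := "1.0"

scalaVersion := "2.10.5"

libraryDependencies ++= Seq(
  // Spark core matching the cluster version; "provided" because spark-submit supplies it at runtime
  "org.apache.spark" %% "spark-core" % "1.6.0" % "provided",
  // JDBC driver for MySQL; the exact 5.1.x version is an assumption
  "mysql" % "mysql-connector-java" % "5.1.38"
)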




import java.sql.{Connection, DriverManager, PreparedStatement}

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * <Function> Reads SQL statements from text files on HDFS and imports them into MySQL. </Function>
 *
 * Author: Created by fuli.shen on 2016/5/30.
 */
object RDDtoMySQL {

  var inputPath = ""
  var ip = ""
  var dataBaseName = ""
  var userName = ""
  var password = ""

  def main(args: Array[String]) {
    if (args.length != 5) {
      println("Usage: <inputPath> <ip> <dataBaseName> <userName> <password>")
      sys.exit(1)
    }
    inputPath = args(0)
    ip = args(1)
    dataBaseName = args(2)
    userName = args(3)
    password = args(4)
    // Initialize the Spark context
    val conf: SparkConf = new SparkConf()
      .setAppName("RDDtoMySQL")
    val sc: SparkContext = new SparkContext(conf)

    // Read the HDFS text files into an RDD, one SQL statement per line
    val textFile: RDD[String] = sc.textFile(inputPath)
    // Import the RDD into MySQL; foreachPartition opens one connection per
    // partition instead of one per record
    textFile.foreachPartition(importMySQL)
    sc.stop()
  }

  /**
   * Executes the SQL statements of one partition against MySQL over a single
   * connection, so the connection cost is paid once per partition.
   *
   * @param iterator the SQL statements of this partition
   */
  def importMySQL(iterator: Iterator[String]): Unit = {

    // Initialize the JDBC resources
    var conn: Connection = null
    var ps: PreparedStatement = null
    try {
      val url = "jdbc:mysql://" + ip + "/" + dataBaseName
      conn = DriverManager.getConnection(url, userName, password)
      val s1 = System.currentTimeMillis()
      iterator.foreach(sql => {
        ps = conn.prepareStatement(sql)
        ps.executeUpdate()
        ps.close() // close each statement; closing it again in finally is a harmless no-op
      })
      val s2 = System.currentTimeMillis()
      println("update data consume time:" + (s2 - s1) + "ms")
    } catch {
      case e: Exception => println("RDDtoMySQL.importMySQL MySQL Exception: " + e.getMessage)
    } finally {
      if (null != ps) {
        ps.close()
      }
      if (null != conn) {
        conn.close()
      }
    }
  }
}
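
Assuming the project is packaged with sbt package, the job could be submitted along these lines; the jar path, master, HDFS input path, and connection details are all placeholders (--jars ships the MySQL driver to the executors):

spark-submit \
  --class RDDtoMySQL \
  --master yarn \
  --jars mysql-connector-java-5.1.38.jar \
  target/scala-2.10/rddtomysql_2.10-1.0.jar \
  /user/hive/output/sql 192.168.1.100:3306 test_db root 123456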


Note: once more, the MySQL connection is obtained inside each partition. A JDBC Connection is not serializable, so it cannot be created on the driver and captured by the foreachPartition closure; it has to be opened on the executor that processes the partition.
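
The importMySQL above still issues one executeUpdate per statement. A variant using JDBC statement batching could reduce round trips to the server further. A minimal sketch, assuming the lines are plain INSERT/UPDATE statements (this also needs java.sql.Statement added to the imports):

  // Batched variant (sketch): queues the partition's statements into one
  // JDBC batch and sends them to MySQL in a single round trip.
  def importMySQLBatch(iterator: Iterator[String]): Unit = {
    var conn: Connection = null
    var stmt: Statement = null
    try {
      val url = "jdbc:mysql://" + ip + "/" + dataBaseName
      conn = DriverManager.getConnection(url, userName, password)
      stmt = conn.createStatement()
      iterator.foreach(sql => stmt.addBatch(sql))
      stmt.executeBatch() // execute every queued statement at once
    } catch {
      case e: Exception => println("RDDtoMySQL.importMySQLBatch MySQL Exception: " + e.getMessage)
    } finally {
      if (null != stmt) stmt.close()
      if (null != conn) conn.close()
    }
  }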