This article demonstrates, through code examples, how Spark reads data from various data sources.

1 Reading Hive data with Spark

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.hive.HiveContext;

/**
 * Hive data source
 * @author Administrator
 *
 */
public class HiveDataSource {

   @SuppressWarnings("deprecation")
   public static void main(String[] args) {
      // First, create a SparkConf as usual
      SparkConf conf = new SparkConf()
            .setAppName("HiveDataSource");
      // Create the JavaSparkContext
      JavaSparkContext sc = new JavaSparkContext(conf);
      // Create the HiveContext; note that it takes a SparkContext as its argument, not a JavaSparkContext
      HiveContext hiveContext = new HiveContext(sc.sc());
      
      // Feature 1: HiveContext's sql() method can execute any HiveQL statement that Hive itself can run
      // Drop the student_infos table if it already exists
      hiveContext.sql("DROP TABLE IF EXISTS student_infos");
      // Create the student_infos table if it does not already exist
      hiveContext.sql("CREATE TABLE IF NOT EXISTS student_infos (name STRING, age INT)");
      // Load the basic student information into the student_infos table
      hiveContext.sql("LOAD DATA "
            + "LOCAL INPATH '/usr/local/spark-study/resources/student_infos.txt' "
            + "INTO TABLE student_infos");
      
      // Load data into student_scores the same way
      hiveContext.sql("DROP TABLE IF EXISTS student_scores"); 
      hiveContext.sql("CREATE TABLE IF NOT EXISTS student_scores (name STRING, score INT)");  
      hiveContext.sql("LOAD DATA "
            + "LOCAL INPATH '/usr/local/spark-study/resources/student_scores.txt' "
            + "INTO TABLE student_scores");
      
      // Feature 2: sql() also returns a DataFrame, which can be used for further queries
      // Join the two tables and query the students with a score of 80 or above
      DataFrame goodStudentsDF = hiveContext.sql("SELECT si.name, si.age, ss.score "
            + "FROM student_infos si "
            + "JOIN student_scores ss ON si.name=ss.name "
            + "WHERE ss.score>=80");
      
      // Feature 3: the data in a DataFrame can be saved into a Hive table;
      // all that is required is that the DataFrame's underlying RDD elements are Rows
      // Save the DataFrame's data into the good_student_infos table
      hiveContext.sql("DROP TABLE IF EXISTS good_student_infos");  
      goodStudentsDF.saveAsTable("good_student_infos");  
      
      // Feature 4: the table() method creates a DataFrame directly from a Hive table
      
      // Create a DataFrame directly from the good_student_infos table and print its rows
      Row[] goodStudentRows = hiveContext.table("good_student_infos").collect();  
      for(Row goodStudentRow : goodStudentRows) {
         System.out.println(goodStudentRow);  
      }
      
      sc.close();
   }
   
}
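
If you are on Spark 2.x or later, HiveContext has been superseded by SparkSession with Hive support enabled. The following is a minimal sketch of the same flow on that newer API (the class name and setup here are illustrative assumptions, not part of the original example):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class HiveDataSourceSpark2 {

   public static void main(String[] args) {
      // A SparkSession with Hive support replaces SparkConf/JavaSparkContext/HiveContext
      SparkSession spark = SparkSession.builder()
            .appName("HiveDataSource")
            .enableHiveSupport()
            .getOrCreate();

      // spark.sql() plays the role of hiveContext.sql(); DataFrame is now Dataset<Row>
      Dataset<Row> goodStudentsDF = spark.sql("SELECT si.name, si.age, ss.score "
            + "FROM student_infos si "
            + "JOIN student_scores ss ON si.name=ss.name "
            + "WHERE ss.score>=80");

      // write().saveAsTable() replaces the deprecated DataFrame.saveAsTable()
      goodStudentsDF.write().saveAsTable("good_student_infos");

      spark.stop();
   }

}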

2 Reading JDBC data sources with Spark

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import scala.Tuple2;

/**
 * JDBC data source
 * @author Administrator
 *
 */
public class JDBCDataSource {

   public static void main(String[] args) {
      SparkConf conf = new SparkConf()
            .setAppName("JDBCDataSource");  
      JavaSparkContext sc = new JavaSparkContext(conf);
      SQLContext sqlContext = new SQLContext(sc);
      
      // To summarize the JDBC data source workflow:
      // First, use SQLContext's read() API to load the MySQL data as DataFrames
      // Then convert the DataFrames to RDDs and process them with the operators provided by Spark Core
      // Finally, write the results, via the foreach() operator, into MySQL, HBase, Redis, or another db / cache
      
      // Load the two MySQL tables as separate DataFrames
      Map<String, String> options = new HashMap<String, String>();
      options.put("url", "jdbc:mysql://spark1:3306/testdb");
      options.put("dbtable", "student_infos");

      DataFrame studentInfosDF = sqlContext.read().format("jdbc")
            .options(options).load();
      
      options.put("dbtable", "student_scores");
      DataFrame studentScoresDF = sqlContext.read().format("jdbc")
            .options(options).load();
      
      // Convert both DataFrames to JavaPairRDDs and join them
      JavaPairRDD<String, Tuple2<Integer, Integer>> studentsRDD = 
            
            studentInfosDF.javaRDD().mapToPair(
            
                  new PairFunction<Row, String, Integer>() {
      
                     private static final long serialVersionUID = 1L;
            
                     @Override
                     public Tuple2<String, Integer> call(Row row) throws Exception {
                        return new Tuple2<String, Integer>(row.getString(0), 
                              Integer.valueOf(String.valueOf(row.get(1))));  
                     }
                     
                  })
            .join(studentScoresDF.javaRDD().mapToPair(
                     
                  new PairFunction<Row, String, Integer>() {
      
                     private static final long serialVersionUID = 1L;
      
                     @Override
                     public Tuple2<String, Integer> call(Row row) throws Exception {
                        return new Tuple2<String, Integer>(String.valueOf(row.get(0)),
                              Integer.valueOf(String.valueOf(row.get(1))));  
                     }
                     
                  }));
      
      // Convert the JavaPairRDD into a JavaRDD<Row>
      JavaRDD<Row> studentRowsRDD = studentsRDD.map(
            
            new Function<Tuple2<String,Tuple2<Integer,Integer>>, Row>() {

               private static final long serialVersionUID = 1L;

               @Override
               public Row call(
                     Tuple2<String, Tuple2<Integer, Integer>> tuple)
                     throws Exception {
                  return RowFactory.create(tuple._1, tuple._2._1, tuple._2._2);
               }
               
            });
      
      // Keep only the rows with a score greater than 80
      JavaRDD<Row> filteredStudentRowsRDD = studentRowsRDD.filter(
            
            new Function<Row, Boolean>() {

               private static final long serialVersionUID = 1L;

               @Override
               public Boolean call(Row row) throws Exception {
                  return row.getInt(2) > 80;
               }
               
            });
      
      // Define the schema metadata and convert back to a DataFrame
      List<StructField> structFields = new ArrayList<StructField>();
      structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));  
      structFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true)); 
      structFields.add(DataTypes.createStructField("score", DataTypes.IntegerType, true)); 
      StructType structType = DataTypes.createStructType(structFields);
      
      DataFrame studentsDF = sqlContext.createDataFrame(filteredStudentRowsRDD, structType);
      
      Row[] rows = studentsDF.collect();
      for(Row row : rows) {
         System.out.println(row);  
      }
      
      // Save the DataFrame's data into a MySQL table
      // This pattern is very common in production; the target might be MySQL, HBase, or a Redis cache
      studentsDF.javaRDD().foreach(new VoidFunction<Row>() {
         
         private static final long serialVersionUID = 1L;

         @Override
         public void call(Row row) throws Exception {
            String sql = "insert into good_student_infos values(" 
                  + "'" + String.valueOf(row.getString(0)) + "',"
                  + Integer.valueOf(String.valueOf(row.get(1))) + ","
                  + Integer.valueOf(String.valueOf(row.get(2))) + ")";   
            
            Class.forName("com.mysql.jdbc.Driver");  
            
            Connection conn = null;
            Statement stmt = null;
            try {
               conn = DriverManager.getConnection(
                     "jdbc:mysql://spark1:3306/testdb", "", "");
               stmt = conn.createStatement();
               stmt.executeUpdate(sql);
            } catch (Exception e) {
               e.printStackTrace();
            } finally {
               if(stmt != null) {
                  stmt.close();
               } 
               if(conn != null) {
                  conn.close();
               }
            }
         }
         
      }); 
      
      sc.close();
   }
   
}
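
As an aside, the foreach() approach above builds the INSERT statement by string concatenation and opens a new connection for every row, which is fine for a demo but fragile in practice. Since Spark 1.4, a DataFrame can also be written back over JDBC in a single call through the DataFrameWriter; a minimal sketch, assuming the same MySQL URL as above (the user and password values are hypothetical placeholders):

import java.util.Properties;

import org.apache.spark.sql.SaveMode;

// Connection properties for the JDBC writer (user and password are hypothetical placeholders)
Properties connProps = new Properties();
connProps.setProperty("user", "root");
connProps.setProperty("password", "password");

// Append the filtered result into the good_student_infos table in one call
studentsDF.write()
      .mode(SaveMode.Append)
      .jdbc("jdbc:mysql://spark1:3306/testdb", "good_student_infos", connProps);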

3 Reading JSON data with Spark

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import scala.Tuple2;

/**
 * JSON data source
 * @author Administrator
 *
 */
public class JSONDataSource {

   public static void main(String[] args) {
      SparkConf conf = new SparkConf()
            .setAppName("JSONDataSource");  
      JavaSparkContext sc = new JavaSparkContext(conf);
      SQLContext sqlContext = new SQLContext(sc);
      
      // Create a DataFrame from a JSON file
      DataFrame studentScoresDF = sqlContext.read().json(
            "hdfs://spark1:9000/spark-study/students.json");  
      
      // Register the student-score DataFrame as a temp table, then query
      // the names of students with a score of 80 or above
      studentScoresDF.registerTempTable("student_scores");
      DataFrame goodStudentScoresDF = sqlContext.sql(
            "select name,score from student_scores where score>=80");
      
      // Convert the DataFrame to an RDD, apply a transformation, and collect the qualifying names
      List<String> goodStudentNames = goodStudentScoresDF.javaRDD().map(
            
            new Function<Row, String>() {
               
               private static final long serialVersionUID = 1L;
      
               @Override
               public String call(Row row) throws Exception {
                  return row.getString(0);
               }
               
            }).collect();
      
      // Next, create a DataFrame from a JavaRDD<String> whose elements are JSON strings
      List<String> studentInfoJSONs = new ArrayList<String>();
      studentInfoJSONs.add("{\"name\":\"Leo\", \"age\":18}");  
      studentInfoJSONs.add("{\"name\":\"Marry\", \"age\":17}");  
      studentInfoJSONs.add("{\"name\":\"Jack\", \"age\":19}");
      JavaRDD<String> studentInfoJSONsRDD = sc.parallelize(studentInfoJSONs);

      DataFrame studentInfosDF = sqlContext.read().json(studentInfoJSONsRDD);
      
      // Register the student-info DataFrame as a temp table, then query the basic info of the high-scoring students
      studentInfosDF.registerTempTable("student_infos");  
      
      String sql = "select name,age from student_infos where name in (";
      for(int i = 0; i < goodStudentNames.size(); i++) {
         sql += "'" + goodStudentNames.get(i) + "'";
         if(i < goodStudentNames.size() - 1) {
            sql += ",";
         }
      }
      sql += ")";
      
      DataFrame goodStudentInfosDF = sqlContext.sql(sql);
      
      // Convert both DataFrames into JavaPairRDDs (DataFrame -> JavaRDD -> mapToPair) and join them
      JavaPairRDD<String, Tuple2<Integer, Integer>> goodStudentsRDD = 
            
            goodStudentScoresDF.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {

               private static final long serialVersionUID = 1L;
      
               @Override
               public Tuple2<String, Integer> call(Row row) throws Exception {
                  return new Tuple2<String, Integer>(row.getString(0), 
                        Integer.valueOf(String.valueOf(row.getLong(1))));  
               }
               
            }).join(goodStudentInfosDF.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {
      
               private static final long serialVersionUID = 1L;
      
               @Override
               public Tuple2<String, Integer> call(Row row) throws Exception {
                  return new Tuple2<String, Integer>(row.getString(0),
                        Integer.valueOf(String.valueOf(row.getLong(1))));   
               }
               
            }));
      
      // Convert the joined high-scoring-student data into a JavaRDD<Row>
      // so that it can be turned back into a DataFrame
      JavaRDD<Row> goodStudentRowsRDD = goodStudentsRDD.map(
            
            new Function<Tuple2<String,Tuple2<Integer,Integer>>, Row>() {

               private static final long serialVersionUID = 1L;

               @Override
               public Row call(
                     Tuple2<String, Tuple2<Integer, Integer>> tuple)
                     throws Exception {
                  return RowFactory.create(tuple._1, tuple._2._1, tuple._2._2);
               }
               
            });
      
      // Define the schema metadata and convert the JavaRDD<Row> into a DataFrame
      List<StructField> structFields = new ArrayList<StructField>();
      structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true)); 
      structFields.add(DataTypes.createStructField("score", DataTypes.IntegerType, true));  
      structFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));  
      StructType structType = DataTypes.createStructType(structFields);
      
      DataFrame goodStudentsDF = sqlContext.createDataFrame(goodStudentRowsRDD, structType);
      
      // Save the complete information of the high-scoring students to an external JSON file
      goodStudentsDF.write().format("json").save("hdfs://spark1:9000/spark-study/good-students");  
      
      sc.close();
   }
   
}
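
One detail worth noting in the JSON example: Spark's JSON schema inference maps integer fields to LongType, which is why the code above reads the numbers with getLong() and then converts them. If you would rather get IntegerType back directly, you can supply an explicit schema to the reader; a minimal sketch against the same students.json file (assuming its fields are name and score, as the query above implies):

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

// Explicit schema so the numeric field is read as IntegerType instead of the inferred LongType
List<StructField> scoreFields = new ArrayList<StructField>();
scoreFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
scoreFields.add(DataTypes.createStructField("score", DataTypes.IntegerType, true));
StructType scoreSchema = DataTypes.createStructType(scoreFields);

DataFrame studentScoresDF = sqlContext.read()
      .schema(scoreSchema)
      .json("hdfs://spark1:9000/spark-study/students.json");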

4 Reading Parquet data with Spark

import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

/**
 * Parquet data source: loading data programmatically
 * @author Administrator
 *
 */
public class ParquetLoadData {

   public static void main(String[] args) {
      SparkConf conf = new SparkConf()
            .setAppName("ParquetLoadData");  
      JavaSparkContext sc = new JavaSparkContext(conf);
      SQLContext sqlContext = new SQLContext(sc);
      
      // Read the Parquet file and create a DataFrame
      DataFrame usersDF = sqlContext.read().parquet(
            "hdfs://spark1:9000/spark-study/users.parquet");
      
      // Register the DataFrame as a temp table, then use SQL to query the needed data
      usersDF.registerTempTable("users");  
      DataFrame userNamesDF = sqlContext.sql("select name from users");  
      
      // Apply a transformation to the queried DataFrame, process the data, and print it
      List<String> userNames = userNamesDF.javaRDD().map(new Function<Row, String>() {

         private static final long serialVersionUID = 1L;

         @Override
         public String call(Row row) throws Exception {
            return "Name: " + row.getString(0);
         }
         
      }).collect();
      
      for(String userName : userNames) {
         System.out.println(userName);  
      }
      
      sc.close();
   }
   
}
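
The DataFrameWriter can write Parquet as well as read it, so a query result can be persisted back in columnar form; a minimal sketch that could be appended to the example above (the output path is a hypothetical example):

import org.apache.spark.sql.SaveMode;

// Write the query result back out in Parquet format, overwriting any previous run
userNamesDF.write()
      .mode(SaveMode.Overwrite)
      .parquet("hdfs://spark1:9000/spark-study/user-names.parquet");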