I. Using Hive as a data source with SparkSQL
1. The official docs say the three config files (core-site.xml, hdfs-site.xml, hive-site.xml) must be placed on the program's classpath. In my tests, a local run needs the program to be told where the Hadoop components (HDFS, Hive) live; on CDH nothing has to be set in the program, presumably because CDH's Spark already puts these three files on the classpath (not verified on Apache Spark). Searching the server with find / -name core-site.xml shows the three config files under the spark2_on_yarn directory.
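If you want to confirm the files are actually visible to the program, a minimal Scala sketch (a fragment to drop into main; nothing project-specific assumed) is:

// Sanity check: print where (or whether) each Hadoop/Hive config file is found on the classpath.
Seq("core-site.xml", "hdfs-site.xml", "hive-site.xml").foreach { name =>
  val url = Thread.currentThread().getContextClassLoader.getResource(name)
  println(s"$name -> ${Option(url).getOrElse("NOT on classpath")}")
}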
2. Two settings may be needed when creating the SparkSession object. Tested: they are required for local runs but not on CDH.
val sparkSession = SparkSession.builder().config(sparkConf)
  //.config("spark.sql.warehouse.dir", "/user/hive/warehouse")
  //.config("hive.metastore.uris", "thrift://node105:9083")
  .enableHiveSupport()
  .getOrCreate()
3. A local run also needs the spark-hive dependency. If you hit errors, they are most likely caused by missing jars as well, because a local run, unlike CDH, does not have everything preconfigured and Hadoop jars may be missing. On CDH no extra jars are needed at all, not even spark-hive. A dependency sketch follows.
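For reference, a minimal sbt sketch of what a local run typically needs; the version numbers are placeholders, pick the ones matching your cluster (on CDH these can be marked Provided since the cluster supplies them):

// build.sbt (hypothetical versions; match your Spark/Scala versions)
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql"  % "2.4.0",
  "org.apache.spark" %% "spark-hive" % "2.4.0"
)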
II. Kerberos authentication when a Spark program connects to Hive
1. Kerberos passwordless authentication here relies on three pieces of configuration (the krb5.conf file, the keytab file, and the principal). The program reads them, sets them on a Hadoop Configuration object, and then uses the API in the hadoop.security package to attempt to log in to Hadoop. The whole procedure is wrapped in a single method: if the method returns without throwing, the current node has been authenticated and authorized by the Hadoop cluster, and connections it establishes are allowed to reach Hive.
2. Because the Kerberos ticket expires after 5 minutes in this environment, the program creates a re-authentication thread through a thread factory at startup and runs the authentication method every 5 minutes. The thread must be a daemon thread, because this program should exit as soon as its job finishes; it is not a web application that has to keep running. The whole mechanism is wrapped in the HiveAuthen class, and invoking the HiveAuthen constructor starts the re-authentication thread.
package com.cib.dqms.auth;

import com.cib.dqms.core.util.thread.ScheduledThreadFactory;
import com.cib.dqms.utils.PropertiesUtil;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.UserGroupInformation;

import java.io.IOException;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

public class HiveAuthen {

    private ScheduledExecutorService scheduledExecutor =
            Executors.newScheduledThreadPool(1, new ScheduledThreadFactory());

    public HiveAuthen() {
        krbAuth();
        // Run the Kerberos authentication/authorization method once every 5 minutes.
        scheduledExecutor.scheduleAtFixedRate(new Runnable() {
            @Override
            public void run() {
                krbAuth();
            }
        }, 5L, 5L, TimeUnit.MINUTES);
    }

    /**
     * Authenticate and obtain authorization from Kerberos.
     */
    public void krbAuth() {
        String krbConf = PropertiesUtil.getRequiredStringProperty("hive.krb.conf");
        String krbKeyTab = PropertiesUtil.getRequiredStringProperty("hive.krb.key");
        String krbPrincipal = PropertiesUtil.getRequiredStringProperty("hive.krb.principal");
        if (StringUtils.isEmpty(krbConf) || StringUtils.isEmpty(krbKeyTab) || StringUtils.isEmpty(krbPrincipal)) {
            throw new RuntimeException("kerberos authentication configuration (krb5.conf / keytab / principal) is missing");
        }
        System.setProperty("java.security.krb5.conf", krbConf);
        Configuration configuration = new Configuration();
        configuration.set("hadoop.security.authentication", "kerberos");
        // Keytab-based login, so no interactive password entry is needed
        configuration.set("keytab.file", krbKeyTab);
        configuration.setBoolean("hadoop.security.authorization", true);
        configuration.set("kerberos.principal", krbPrincipal);
        try {
            UserGroupInformation.setConfiguration(configuration);
            UserGroupInformation.loginUserFromKeytab(krbPrincipal, krbKeyTab);
        } catch (IOException e) {
            System.err.println(e.getMessage());
        }
    }
}
package com.cib.dqms.core.util.thread;

import java.util.concurrent.ThreadFactory;
import java.util.concurrent.atomic.AtomicInteger;

public class ScheduledThreadFactory implements ThreadFactory {

    private static final AtomicInteger poolNumber = new AtomicInteger(1);
    private final ThreadGroup group;
    private final AtomicInteger threadNumber = new AtomicInteger(1);
    private final String namePrefix;

    public ScheduledThreadFactory() {
        SecurityManager s = System.getSecurityManager();
        group = (s != null) ? s.getThreadGroup() : Thread.currentThread().getThreadGroup();
        namePrefix = "Scheduled Pool-" + poolNumber.getAndIncrement() + "-Thread-";
    }

    @Override
    public Thread newThread(Runnable r) {
        Thread t = new Thread(group, r, namePrefix + threadNumber.getAndIncrement());
        /*
         * Mark the thread as a daemon so the JVM exits once the jar's job finishes.
         * If it were a user thread, this re-authentication thread would keep the
         * process alive on Linux even after the business logic completed.
         */
        if (!t.isDaemon()) {
            t.setDaemon(true);
        }
        if (t.getPriority() != Thread.NORM_PRIORITY) {
            t.setPriority(Thread.NORM_PRIORITY);
        }
        return t;
    }
}
import com.cib.dqms.auth.HiveAuthen
import org.apache.spark.sql.SparkSession

object SparkWithPMMLAndDQMS {

  def main(args: Array[String]): Unit = {
    /*
     * Invoke the HiveAuthen constructor; the constructor starts the Kerberos
     * authentication thread.
     */
    new HiveAuthen()

    /*
     * Create the context; the SparkSession needs to connect to Hive.
     */
    val sparkSession = SparkSession.builder()
      .appName("SparkHive")
      // Set here for convenience when running locally; when submitting to the
      // cluster, set the master in the submit script instead.
      .master("local")
      /* The two configs below are not needed on CDH; enable them for local runs.
      .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
      .config("hive.metastore.uris", "thrift://node105:9083")
      */
      .enableHiveSupport()
      .getOrCreate()

    val df = sparkSession.sql("SELECT * FROM dctest.sogouq11 limit 10")
    df.printSchema()
    df.show(10, false)
    // ......
  }
}
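To confirm at runtime that the keytab login actually took effect, a small check can be added right after new HiveAuthen(). This is a sketch using Hadoop's standard UserGroupInformation API, not part of the original program:

import org.apache.hadoop.security.UserGroupInformation

// Reports which principal the process is currently logged in as, and whether the login came from a keytab.
println("Logged in as: " + UserGroupInformation.getCurrentUser.getUserName)
println("Keytab-based login: " + UserGroupInformation.isLoginKeytabBased)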
package com.cib.dqms.utils;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Properties;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class PropertiesUtil {

    private static Log log = LogFactory.getLog(PropertiesUtil.class);
    private static Properties props = new Properties();
    private static String propertyFileName = "/system.properties";

    static {
        try {
            if (props.size() == 0) {
                log.info("Start reading the " + propertyFileName + " file.");
                InputStream input = PropertiesUtil.class.getResourceAsStream(propertyFileName);
                props.load(input);
                input.close();
            }
        } catch (IOException ioe) {
            log.error(ioe.getMessage());
            log.debug(ioe);
        }
    }

    public static Integer getRequiredIntgerProperty(String propertyName) {
        String str = getRequiredStringProperty(propertyName);
        return Integer.parseInt(str);
    }

    public static String getRequiredStringProperty(String propertyName) {
        String str = getStringProperty(propertyName, null);
        if (StringUtils.isBlank(str)) {
            throw new RuntimeException(propertyName + " not in property file " + propertyFileName);
        }
        return str;
    }

    public static String getStringProperty(String propertyName, String defaultValue) {
        if (props.containsKey(propertyName)) {
            return (String) props.get(propertyName);
        }
        return defaultValue;
    }

    public static Integer getIntegerProperty(String propertyName, String defaultValue) {
        String str = getStringProperty(propertyName, defaultValue);
        if (StringUtils.isNotBlank(str)) {
            return Integer.parseInt(str);
        }
        return null;
    }

    public static String getStringProperty(String propertyName, String defaultValue, String encoding) {
        if (props.containsKey(propertyName)) {
            // Properties files are read as ISO8859-1; convert the value to the requested encoding.
            String value = (String) props.get(propertyName);
            try {
                value = new String(value.getBytes("ISO8859-1"), encoding);
            } catch (UnsupportedEncodingException e) {
                e.printStackTrace();
            }
            return value;
        }
        return defaultValue;
    }
}
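For completeness, the three Kerberos keys that HiveAuthen reads through PropertiesUtil must be present in /system.properties on the classpath. A hypothetical usage sketch follows; the example values in the comments are placeholders, only the key names come from the code above:

// Hypothetical values; the keys are exactly the ones HiveAuthen reads.
val krbConf      = PropertiesUtil.getRequiredStringProperty("hive.krb.conf")      // e.g. /etc/krb5.conf
val krbKeyTab    = PropertiesUtil.getRequiredStringProperty("hive.krb.key")       // e.g. /home/user/user.keytab
val krbPrincipal = PropertiesUtil.getRequiredStringProperty("hive.krb.principal") // e.g. user@EXAMPLE.COM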