安装目录以及各个版本
大数据安装版本
软件 | 版本 | 备注 |
hadoop | hadoop-3.3.4 | hdfs基础 |
spark | spark-3.2.4-bin-hadoop3.2 | 计算框架 |
zookeeper | apache-zookeeper-3.5.7-bin | 分布式服务器 |
hbase | hbase-2.5.7-hadoop3 | 列式存储 |
hive | apache-hive-3.1.3-bin | 数仓元数据 |
创建文件
cd /export/server/hadoop
mkdir tmpdata
rm -rf tmpdata/*
vim workers
```sh
hadoop01
hadoop02
hadoop03
修改 vim yarn-site.xml hadoop01
<property>
<name>yarn.resourcemanager.hostname</name>
<value>hadoop01</value>
</property>
<property>
<name>yarn.log.server.url</name>
<value>http://hadoop01:19888/jobhistory/logs</value>
</property>
修改 vim core-site.xml hadoop01
<property>
<name>fs.defaultFS</name>
<value>hdfs://hadoop01:8020</value>
</property>
修改 vim hdfs-site.xml hadoop01
<!-- nn web端访问地址-->
<property>
<name>dfs.namenode.http-address</name>
<value>hadoop01:9870</value>
</property>
<!-- 2nn web端访问地址-->
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>hadoop01:9868</value>
</property>
<property>
<name>dfs.namenode.hosts</name>
<value>hadoop01,hadoop02,hadoop03</value>
</property>
修改mapred-site.xml hadoop01
<!-- 历史服务器端地址 -->
<property>
<name>mapreduce.jobhistory.address</name>
<value>hadoop01:10020</value>
</property>
<!-- 历史服务器web端地址 -->
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>hadoop01:19888</value>
</property>
修改 spark-env.sh
SPARK_MASTER_HOST=hadoop01
SPARK_HISTORY_OPTS="-Dspark.history.fs.logDirectory=hdfs://hadoop01:8020/sparklog/ -Dspark.history.fs.cleaner.enabled=true"
修改 vim spark-defaults.conf
spark.master spark://hadoop01:7077
spark.eventLog.enabled true
spark.eventLog.dir hdfs://hadoop01:8020/sparklog/
修改: vim workers
hadoop01
hadoop02
hadoop03
添加 sparklog
hadoop fs -mkdir /sparklog
启动服务
su - hadoop
-- 启动hadoop和spark
exec-hadoop.sh start
exec-hadoop.sh stop
-- 启动数据库
systemctl start docker // pgsql hive
-- 启动hive
nohup hive --service metastore >> /export/server/hive/logs/metastore.log 2>&1 &
-- 启动hbase
/export/server/hbase/bin/start-hbase.sh
cd /export/server/hbase && bin/stop-hbase.sh
本地虚拟机快照记录
查看系统
lsb_release -a
cat /etc/redhat-release
修改主机名
hostnamectl set-hostname <新主机名>
/etc/hostname
两台服务器的主机名
hostname
node1
node2
node3
安装python
使用包管理工具anaconda3
参考:
创建用户hadoop
adduser hadoop
passwd hadoop
安装hadoop
hadoop的配置文件
cd /export/server/hadoop/etc/hadoop
mkdir -p /export/server/hadoop/logs
mkdir -p /export/server/hadoop/tmpdata
mkdir -p /data/nm-local /data/nm-log
配置workers
vim workers
node1
node2
node3
配置hadoop-env.sh
vim hadoop-env.sh
export JAVA_HOME=/export/server/jdk
export HADOOP_HOME=/export/server/hadoop
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export HADOOP_LOG_DIR=$HADOOP_HOME/logs
# export HADOOP_PID_DIR=/home/hadoop/tmp
# export HADOOP_SECURE_PID_DIR=/home/hadoop/tmp
配置core-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://node1:8020</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/export/server/hadoop/tmpdata</value>
</property>
<property>
<name>hadoop.http.staticuser.user</name>
<value>hadoop</value>
</property>
<property>
<name>hadoop.proxyuser.hadoop.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hadoop.groups</name>
<value>*</value>
</property>
</configuration>
配置hdfs-site.xml
NameNode有一个工作线程池,用来处理不同DataNode的并发心跳以及客户端并发的元数据操作。对于大集群或者有大量客户端的集群来说,通常需要增大参数dfs.namenode.handler.count的默认值10。设置该值的一般原则是将其设置为集群大小的自然对数乘以20,即20logN,N为集群大小
dfs.blocksize默认是128M,大性能高,小存储利用率高
https://www.shezhan88.com/variousinfo/1183213.html
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<!-- nn web端访问地址-->
<property>
<name>dfs.namenode.http-address</name>
<value>node1:9870</value>
</property>
<!-- 2nn web端访问地址-->
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>node1:9868</value>
</property>
<property>
<name>dfs.datanode.data.dir.perm</name>
<value>777</value>
</property>
<property>
<name>dfs.blocksize</name>
<value>268435456</value>
</property>
<property>
<name>dfs.namenode.handler.count</name>
<value>100</value>
</property>
<property>
<name>dfs.namenode.hosts</name>
<value>node1,node2,node3</value>
</property>
<property>
<name>dfs.permissions</name>
<value>false</value>
</property>
</configuration>
hadoop-env.sh
export JAVA_HOME=/export/server/jdk
export HADOOP_HOME=/export/server/hadoop
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export HADOOP_LOG_DIR=$HADOOP_HOME/logs
# export HADOOP_PID_DIR=/home/shhadoop/tmp
# export HADOOP_SECURE_PID_DIR=/home/shhadoop/tmp
# export HADOOP_OS_TYPE=${HADOOP_OS_TYPE:-$(uname -s)}
mapred-env.sh
export JAVA_HOME=/export/server/jdk
yarn-env.sh
export JAVA_HOME=/export/server/jdk
export HADOOP_HOME=/export/server/hadoop
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export HADOOP_LOG_DIR=$HADOOP_HOME/logs
配置mapred-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
<description>Execution framework set to Hadoop YARN.</description>
</property>
<!-- 历史服务器端地址 -->
<property>
<name>mapreduce.jobhistory.address</name>
<value>node2:10020</value>
</property>
<!-- 历史服务器web端地址 -->
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>node2:19888</value>
</property>
<property>
<name>yarn.app.mapreduce.am.env</name>
<value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
<description>MAPREDUCE home设置为HADOOP_HOME</description>
</property>
<property>
<name>mapreduce.map.env</name>
<value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
<description>MAPREDUCE HOME 设置为HADOOP_HOME</description>
</property>
<property>
<name>mapreduce.reduce.env</name>
<value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
<description>mAPREDUCE HOME 设置为HADOOP_HOME</description>
</property>
</configuration>
配置yarn-site.xml
yarn-site.xml配置参数参考文档:
yarn.nodemanager.vmem-pmem-ratio = 2.1
每单位的物理内存总量对应的虚拟内存量,默认是2.1,表示每使用1MB的物理内存,最多可以使用2.1MB的虚拟内存总量。
yarn.nodemanager.resource.cpu-vcores=64
该节点上YARN可使用的虚拟CPU个数,默认是8.注意,目前推荐将该值设值为与物理CPU核数数目相同。如果你的节点CPU核数不够8个,则需要调减小这个值,而YARN不会智能的探测节点的物理CPU总数。
yarn.scheduler.minimum-allocation-vcores=1
单个任务可申请的最小虚拟CPU个数,默认是1,如果一个任务申请的CPU个数少于该数,则该对应的值改为这个数。
yarn.scheduler.maximum-allocation-vcores=64
单个任务可申请的最多虚拟CPU个数,默认是32。
<?xml version="1.0"?>
<configuration>
<!-- 指定MR走shuffle -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!-- 指定ResourceManager的地址-->
<property>
<name>yarn.resourcemanager.hostname</name>
<value>node1</value>
</property>
<!-- 开启日志聚集功能 -->
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<!-- 设置日志聚集服务器地址 -->
<property>
<name>yarn.log.server.url</name>
<value>http://node1:19888/jobhistory/logs</value>
</property>
<!-- 设置日志保留时间为7天 -->
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>604800</value>
</property>
<!-- 关闭yarn内存检查 -->
<property>
<name>yarn.nodemanager.pmem-check-enabled</name>
<value>false</value>
</property>
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
</configuration>
全局环境配置/etc/profile
vim /etc/profile
export JAVA_HOME=/usr/local/jdk
export HADOOP_HOME=/mnt/bigdata/components/hadoop
export PYSPARK_PYTHON=/mnt/bigdata/components/anaconda3/envs/pyspark/bin/python
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export SPARK_HOME=/mnt/bigdata/components/spark
export HIVE_HOME=/mnt/bigdata/components/hive
export PATH=$PATH:$JAVA_HOME/bin
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$SPARK_HOME/bin
export PATH=$PATH:$HIVE_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin
.bashrc
# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('/mnt/bigdata/components/anaconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
eval "$__conda_setup"
else
if [ -f "/mnt/bigdata/components/anaconda3/etc/profile.d/conda.sh" ]; then
. "/mnt/bigdata/components/anaconda3/etc/profile.d/conda.sh"
else
export PATH="/mnt/bigdata/components/anaconda3/bin:$PATH"
fi
fi
unset __conda_setup
# <<< conda initialize <<<
export JAVA_HOME=/usr/local/jdk
export PYSPARK_PYTHON=/mnt/bigdata/components/anaconda3/envs/pyspark/bin/python
source .bashrc
同步node1,node2,node3服务器配置
创建同步脚本xsync
mkdir -p /home/hadoop/bin && cd /home/hadoop/bin
vim xsync
#!/bin/bash
# xsync: rsync the given files/directories to every node in the cluster,
# preserving the same absolute path on each remote host.
# Usage: xsync <file-or-dir> [more paths...]

#1. 判断参数个数 — require at least one path argument
if [ $# -lt 1 ]
then
  echo "Usage: xsync <file-or-dir>..." >&2
  exit 1
fi
#2. 遍历集群所有机器
for host in node1 node2 node3
do
  echo "==================== $host ===================="
  #3. 遍历所有目录,挨个发送 — "$@" keeps paths with spaces intact
  for file in "$@"
  do
    #4. 判断文件是否存在 — skip paths missing locally
    if [ -e "$file" ]
    then
      #5. 获取父目录 (resolve symlinks with -P)
      pdir=$(cd -P "$(dirname "$file")" && pwd)
      #6. 获取当前文件的名称
      fname=$(basename "$file")
      # Create the same parent directory remotely, then sync into it.
      ssh "$host" "mkdir -p $pdir"
      rsync -av "$pdir/$fname" "$host:$pdir"
    else
      echo "$file does not exist!" >&2
    fi
  done
done
添加执行权限
chmod +x xsync
同步Hadoop
cd /export/server
sudo /home/hadoop/bin/xsync hadoop
创建jps状态打印
vim /export/server/hadoop/sbin/jpsall.sh
#!/bin/bash
# jpsall.sh: print the running JVM processes (jps) on every cluster node.
for host in node1 node2 node3
do
  echo "=============== $host ==============="
  # NOTE(review): $JAVA_HOME is expanded locally before ssh runs; this
  # assumes the same JDK path exists on every node — confirm.
  ssh "$host" "$JAVA_HOME/bin/jps"
done
添加执行权限
chmod +x jpsall.sh
master服务初始化hadoop
hadoop namenode -format
配置hive
配置 /etc/profile 的hive同上;
授权代理hadoop用户,可以支持hive访问
cd hadoop/etc/hadoop
vim core-site.xml
在 core-site.xml 中配置 hadoop.proxyuser.hadoop.hosts 与 hadoop.proxyuser.hadoop.groups(取值均为 *),与前文 core-site.xml 的代理用户配置相同。
配置hive-env.sh
cd hive/conf
mv hive-env.sh.template hive-env.sh
vim hive-env.sh
export HADOOP_HOME=/export/server/hadoop
export HIVE_CONF_DIR=/export/server/hive/conf
export HIVE_AUX_JARS_PATH=/export/server/hive/lib
配置meta元数据库pgsql
vim hive-site.xml
<configuration>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:postgresql://node2:5432/hivedata?createDatabaseIfNotExist=true</value>
<description>JDBC connect string for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>org.postgresql.Driver</value>
<description>Driver class name for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>hiveuser</value>
<description>username to use against metastore database</description>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>123456</value>
<description>password to use against metastore database</description>
</property>
<property>
<name>hive.server2.thrift.bind.host</name>
<value>node1</value>
</property>
<!-- 指定hiveserver2连接的端口号 -->
<property>
<name>hive.server2.thrift.port</name>
<value>10000</value>
</property>
<!--配置元数据服务-->
<property>
<name>hive.metastore.uris</name>
<value>thrift://node1:9083</value>
</property>
<!-- Hive默认在HDFS的工作目录 -->
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/user/hive/warehouse</value>
</property>
<!-- Hive元数据存储的验证 -->
<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
</property>
<!-- 元数据存储授权 -->
<property>
<name>hive.metastore.event.db.notification.api.auth</name>
<value>false</value>
</property>
</configuration>
hive配置日志位置
mv hive-log4j2.properties.template hive-log4j2.properties
#property.hive.log.dir = ${sys:java.io.tmpdir}/${sys:user.name}
property.hive.log.dir=/export/server/hive/logs
初始化表
cd /export/server/hive/bin
./schematool -dbType postgres -initSchema
测试hive
启动脚本: vim exec-hive.sh
#!/bin/bash
# exec-hive.sh: start/stop the Hive services.
#   start — launch the metastore and HiveServer2 in the background,
#           appending their output to log files under /export/server/hive/logs
#   stop  — kill every RunJar JVM (both Hive services run as RunJar)
if [ $# -lt 1 ]
then
  echo "No Args Input..."
  exit 1
fi
case "$1" in
"start")
  echo " --------------- 启动 hive metastore ---------------"
  nohup hive --service metastore >> /export/server/hive/logs/metastore.log 2>&1 &
  echo " --------------- 启动 hiveserver2 ---------------"
  nohup hive --service hiveserver2 >> /export/server/hive/logs/hiveserver2.log 2>&1 &
;;
"stop")
  echo " --------------- stop hive ---------------"
  # -r: do not run kill at all when no RunJar process is found.
  # NOTE(review): kill -9 gives Hive no chance to shut down cleanly;
  # consider plain kill (SIGTERM) first.
  jps | grep RunJar | awk '{print $1}' | xargs -r -n1 kill -9
;;
*)
  echo "Input Args Error..."
;;
esac
# 启动元数据管理服务
nohup hive --service metastore >> /export/server/hive/logs/metastore.log 2>&1 &
> hive
> show databases;
# 启动的是HiveServer2服务是Hive内置的一个ThriftServer服务,给其他客户端连接使用;第一可能连不上,多试几次
nohup hive --service hiveserver2 >> /export/server/hive/logs/hiveserver2.log 2>&1 &
# 客户端连接
beeline -u jdbc:hive2://node1:10000 -n hadoop
> show databases;
配置spark
hadoop fs -mkdir /sparklog
/etc/profile配置同上;
配置workers
node1
node2
node3
配置spark-env.sh
#!/usr/bin/env bash
export JAVA_HOME=/export/server/jdk
export HADOOP_CONF_DIR=/export/server/hadoop/etc/hadoop
export YARN_CONF_DIR=/export/server/hadoop/etc/hadoop
export SPARK_MASTER_HOST=node2 # 每台master改成自己的主机名
export SPARK_MASTER_PORT=7077
export SPARK_MASTER_WEBUI_PORT=8089
export SPARK_WORKER_CORES=3
export SPARK_WORKER_MEMORY=2g
export SPARK_HISTORY_OPTS="-Dspark.history.fs.logDirectory=hdfs://node2:8020/sparklog/ -Dspark.history.fs.cleaner.enabled=true"
spark-default.conf
# 日志记录打开
spark.eventLog.enabled true
# # 存储位置
spark.eventLog.dir hdfs://node1:8020/sparklog/
# # 压缩
spark.eventLog.compress true
spark.yarn.jars hdfs://node1:8020/spark-jars/*.jar
配置PYSPARK_PYTHON
PYSPARK_PYTHON和 JAVA_HOME 需要同样配置在: /root/.bashrc
中
vim /root/.bashrc
export JAVA_HOME=/export/server/jdk
export PYSPARK_PYTHON=/export/server/anaconda3/bin/python3
source /root/.bashrc
本地测试spark和python是否配置完成
spark-submit --master local[*] /export/server/spark/examples/src/main/python/pi.py 10
配置 spark使用hive
进入 Spark 安装目录
cd /opt/server/spark/conf
增加 hive-site.xml 配置文件
vim hive-site.xml
增加以下配置信息
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/user/hive/warehouse</value>
</property>
<property>
<name>hive.metastore.local</name>
<value>false</value>
</property>
<property>
<name>hive.metastore.uris</name>
<value>thrift://node2:9083</value>
</property>
</configuration>
相关页面查看
hdfs的master ui页面
http://node2:9870/explorer.html#/
http://node1:8088/cluster/nodes
yarn Nodes of the cluster
http://node2:8089/
Spark Master at spark://node2:7077
http://node2:18080/
Spark Event log directory: hdfs://node2:8020/sparklog/
启动服务
cd /export/hadoop/sbin
vim exec-hadoop.sh
#!/bin/bash
# exec-hadoop.sh: start/stop the whole Hadoop + Spark cluster from one place.
#   start — HDFS (node2), YARN (node1), MapReduce historyserver (node2),
#           Spark standalone + Spark history server (node2)
#   stop  — the same services in reverse-ish order
# NOTE(review): $HADOOP_HOME / $SPARK_HOME are expanded locally before ssh
# runs; assumes the same install paths exist on every node — confirm.
if [ $# -lt 1 ]
then
  echo "No Args Input..."
  exit 1
fi
case "$1" in
"start")
  echo " =================== 启动 hadoop集群 ==================="
  echo " --------------- 启动 hdfs ---------------"
  ssh node2 "$HADOOP_HOME/sbin/start-dfs.sh"
  echo " --------------- 启动 yarn ---------------"
  ssh node1 "$HADOOP_HOME/sbin/start-yarn.sh"
  echo " --------------- 启动 historyserver ---------------"
  ssh node2 "$HADOOP_HOME/bin/mapred --daemon start historyserver"
  echo " --------------- 启动 spark ---------------"
  ssh node2 "$SPARK_HOME/sbin/start-all.sh"
  echo " --------------- 启动 spark 历史服务器 ---------------"
  ssh node2 "$SPARK_HOME/sbin/start-history-server.sh"
  # ssh node2 /export/server/zookeeper/bin/zk.sh start
;;
"stop")
  echo " =================== 关闭 hadoop集群 ==================="
  echo " --------------- stop historyserver ---------------"
  ssh node2 "$HADOOP_HOME/bin/mapred --daemon stop historyserver"
  echo " --------------- stop yarn ---------------"
  ssh node1 "$HADOOP_HOME/sbin/stop-yarn.sh"
  echo " --------------- stop hdfs ---------------"
  ssh node2 "$HADOOP_HOME/sbin/stop-dfs.sh"
  echo " --------------- stop spark ---------------"
  ssh node2 "$SPARK_HOME/sbin/stop-all.sh"
  echo " --------------- stop spark 历史服务器 ---------------"
  ssh node2 "$SPARK_HOME/sbin/stop-history-server.sh"
  # ssh node2 /export/server/zookeeper/bin/zk.sh stop
;;
*)
  echo "Input Args Error..."
;;
esac
在hdfs中引入spark的jar包
下载纯净的spark-3.2.4-bin-without-hadoop.tgz包(其中不含hadoop/yarn依赖)
https://archive.apache.org/dist/spark/spark-3.2.4/
tar -zxvf spark-3.2.4-bin-without-hadoop.tgz
cd spark-3.2.4-bin-without-hadoop
hdfs dfs -mkdir /spark-jars
hdfs dfs -put jars/* /spark-jars
chmod +x exec-hadoop.sh
exec-hadoop.sh start
exec-hadoop.sh stop
chown -R hadoop:hadoop apache-hive-3.1.3-bin spark-3.2.4-bin-hadoop3.2 zookeeper spark hive hadoop hadoop-3.3.4
spark on yarn
pyspark --master spark://ecs-qar1-0002:7077
pyspark --master yarn
sc.parallelize([1,2,3]).map(lambda x:x*10).collect()
yarn客户端模式
spark-submit --master yarn --deploy-mode client --driver-memory 512m --executor-memory 512m --num-executors 2 --total-executor-cores 2 $SPARK_HOME/examples/src/main/python/pi.py 10
spark-submit --master yarn --deploy-mode cluster --driver-memory 512m --executor-memory 512m --num-executors 2 --total-executor-cores 2 $SPARK_HOME/examples/src/main/python/pi.py 10
spark-submit --master spark://node1:7077 --driver-memory 2g --total-executor-cores 2 --executor-memory 1g /export/server/spark/examples/src/main/python/pi.py 10