1、先安装的工具
(一)、安装JDK1.8,多个服务器JDK环境不同可以通过 ln -s 当前jdk环境目录 需要映射的目录,统一JDK路径,JDK安装 (二)、安装Zookeeper (三)、安装Mysql

所有的机器都需要执行
2、修改hosts文件

vim /etc/hosts
#hadoop master
10.0.0.198 hdp01
10.0.0.195 hdp02
10.0.0.196 hdp03
10.0.0.199 hdp04
10.0.0.193 hdp05

3、新增普通用户hadoop

#创建hadoop用户,并使用/bin/bash作为shell
sudo useradd -m hadoop -s /bin/bash
#为hadoop用户设置密码
sudo passwd hadoop
#为hadoop用户增加管理员权限                   
sudo adduser hadoop sudo
#切换当前用户为用户hadoop             
su - hadoop
#更新hadoop用户                         
sudo apt-get update

4、新增SSH

ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
chmod 600 ~/.ssh/authorized_keys

5、复制所有的服务器的 ~/.ssh/id_rsa.pub到主服务器,之后把主服务器的~/.ssh/authorized_keys覆盖所有的从服务器(注意权限需要相同:chmod 600 ./authorized_keys),可以使用 ssh 10.0.0.199测试是否免密登录,第一次需要输入yes记录公钥信息

ubuntu下安装iostat ubuntu下安装hadoop集群_ci


下面在主服务器操作

6、下载hadoop到master服务 解压

tar -xvf hadoop-3.3.2.tar.gz
mv hadoop-3.3.2 ./hadoop
#把hadoop赋权给hadoop用户和hadoop用户组
chown -R hadoop:hadoop ./hadoop
#进入配置文件信息
cd ./hadoop/etc/hadoop
vim mapred-env.sh
#新增公共的JDK路径,不同的路径使用ln -s 配置软链接
export JAVA_HOME=/opt/jdk1.8.0_101
vim yarn-env.sh
#新增公共的JDK路径,不同的路径使用ln -s 配置软链接
export JAVA_HOME=/opt/jdk1.8.0_101
vim hadoop-env.sh
#新增公共的JDK路径,不同的路径使用ln -s 配置软链接
export JAVA_HOME=/opt/jdk1.8.0_101
export JAVA_HOME=${JAVA_HOME}
export HADOOP_OPTS="-Djava.library.path=${HADOOP_HOME}/lib/native"
vim core-site.xml
<configuration>
   <property>
       <name>hadoop.tmp.dir</name>
       <value>/data/hadoop/tmp</value>
       <description>Abase for other temporary directories.</description>
   </property>
   <property>
       <!-- fs.default.name 已废弃,Hadoop 2.x/3.x 推荐使用 fs.defaultFS -->
       <name>fs.defaultFS</name>
       <value>hdfs://hdp01:9000</value>
   </property>
  	<!-- Java代码通过用户名密码连接Hive-->
   <property>
       <name>hadoop.proxyuser.hadoop.hosts</name>
       <value>*</value>
   </property>
   <property>
       <name>hadoop.proxyuser.hadoop.groups</name>
       <value>*</value>
   </property>                                                                                                                                                                                                                                   
</configuration>
vim hdfs-site.xml
<configuration>
     <property>
          <name>dfs.namenode.secondary.http-address</name>
          <value>hdp01:9001</value>
     </property>
     <property>
          <name>dfs.namenode.name.dir</name>
          <value>file:/data/hadoop/dfs/name</value>
     </property>
     <property>
          <name>dfs.datanode.data.dir</name>
          <value>file:/data/hadoop/dfs/data</value>
     </property>
     <property>
          <!-- 副本数量,默认3个 -->
          <name>dfs.replication</name>
          <value>3</value>
     </property>
     <property>
          <name>dfs.webhdfs.enabled</name>
          <value>true</value>
     </property>
</configuration>
vim yarn-site.xml
<configuration>
	<!-- Site specific YARN configuration properties -->
    <property>
         <name>yarn.nodemanager.aux-services</name>
         <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>hdp01</value>
    </property>
	<property>
         <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
         <value>org.apache.hadoop.mapred.ShuffleHandler</value>
    </property>
</configuration>
vim mapred-site.xml
<configuration>  
    <property>    
        <name>mapreduce.framework.name</name>    
        <value>yarn</value>    
    </property>    
    <property>    
        <name>mapreduce.jobhistory.address</name>    
        <value>hdp01:10020</value>    
    </property>    
    <property>    
        <name>mapreduce.jobhistory.webapp.address</name>    
        <value>hdp01:19888</value>    
    </property>
    <property>
        <name>yarn.app.mapreduce.am.env</name>
        <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
    </property>
    <property>
        <name>mapreduce.map.env</name>
        <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
    </property>
    <property>
         <name>mapreduce.reduce.env</name>
         <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
    </property>      
</configuration>
vim workers
#从服务器域名或IP
hdp02
hdp03
hdp04
hdp05

vim capacity-scheduler.xml

<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->
<configuration>

    <property>
        <name>yarn.scheduler.capacity.maximum-applications</name>
        <value>10000</value>
        <description>
            Maximum number of applications that can be pending and running.
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.maximum-am-resource-percent</name>
        <value>0.5</value>
        <description>
            Maximum percent of resources in the cluster which can be used to run
            application masters i.e. controls number of concurrent running
            applications.
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.resource-calculator</name>
        <value>org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator</value>
        <description>
            The ResourceCalculator implementation to be used to compare
            Resources in the scheduler.
            The default i.e. DefaultResourceCalculator only uses Memory while
            DominantResourceCalculator uses dominant-resource to compare
            multi-dimensional resources such as Memory, CPU etc.
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.root.queues</name>
        <value>default,hive</value>
        <description>
            The queues at the this level (root is the root queue).
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.root.default.capacity</name>
        <value>50</value>
        <description>Default queue target capacity.</description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.root.default.user-limit-factor</name>
        <value>1</value>
        <description>
            Default queue user limit a percentage from 0.0 to 1.0.
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.root.default.maximum-capacity</name>
        <value>100</value>
        <description>
            The maximum capacity of the default queue.
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.root.default.state</name>
        <value>RUNNING</value>
        <description>
            The state of the default queue. State can be one of RUNNING or STOPPED.
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.root.default.acl_submit_applications</name>
        <value>*</value>
        <description>
            The ACL of who can submit jobs to the default queue.
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.root.default.acl_administer_queue</name>
        <value>*</value>
        <description>
            The ACL of who can administer jobs on the default queue.
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.root.default.acl_application_max_priority</name>
        <value>*</value>
        <description>
            The ACL of who can submit applications with configured priority.
            For e.g, [user={name} group={name} max_priority={priority} default_priority={priority}]
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.root.default.maximum-application-lifetime
        </name>
        <value>-1</value>
        <description>
            Maximum lifetime of an application which is submitted to a queue
            in seconds. Any value less than or equal to zero will be considered as
            disabled.
            This will be a hard time limit for all applications in this
            queue. If positive value is configured then any application submitted
            to this queue will be killed after exceeds the configured lifetime.
            User can also specify lifetime per application basis in
            application submission context. But user lifetime will be
            overridden if it exceeds queue maximum lifetime. It is point-in-time
            configuration.
            Note : Configuring too low value will result in killing application
            sooner. This feature is applicable only for leaf queue.
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.root.default.default-application-lifetime
        </name>
        <value>-1</value>
        <description>
            Default lifetime of an application which is submitted to a queue
            in seconds. Any value less than or equal to zero will be considered as
            disabled.
            If the user has not submitted application with lifetime value then this
            value will be taken. It is point-in-time configuration.
            Note : Default lifetime can't exceed maximum lifetime. This feature is
            applicable only for leaf queue.
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.root.hive.capacity</name>
        <value>50</value>
        <description>
            hive队列的容量为50%
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.root.hive.user-limit-factor</name>
        <value>1</value>
        <description>
            一个用户最多能够获取该队列资源容量的比例
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.root.hive.maximum-capacity</name>
        <value>80</value>
        <description>
            hive队列的最大容量
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.root.hive.state</name>
        <value>RUNNING</value>
    </property>

    <property>
        <name>yarn.scheduler.capacity.root.hive.acl_submit_applications</name>
        <value>*</value>
        <description>
            访问控制,控制谁可以将任务提交到该队列
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.root.hive.acl_administer_queue</name>
        <value>*</value>
        <description>
            访问控制,控制谁可以管理(包括提交和取消)该队列的任务
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.root.hive.acl_application_max_priority</name>
        <value>*</value>
        <description>
            访问控制,控制用户可以提交到该队列的任务的最大优先级
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.root.hive.maximum-application-lifetime</name>
        <value>-1</value>
        <description>
            hive队列中任务的最大生命时长
        </description>
    </property>
    <property>
        <name>yarn.scheduler.capacity.root.hive.default-application-lifetime</name>
        <value>-1</value>
        <description>
            default队列中任务的最大生命时长
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.node-locality-delay</name>
        <value>40</value>
        <description>
            Number of missed scheduling opportunities after which the CapacityScheduler
            attempts to schedule rack-local containers.
            When setting this parameter, the size of the cluster should be taken into account.
            We use 40 as the default value, which is approximately the number of nodes in one rack.
            Note, if this value is -1, the locality constraint in the container request
            will be ignored, which disables the delay scheduling.
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.rack-locality-additional-delay</name>
        <value>-1</value>
        <description>
            Number of additional missed scheduling opportunities over the node-locality-delay
            ones, after which the CapacityScheduler attempts to schedule off-switch containers,
            instead of rack-local ones.
            Example: with node-locality-delay=40 and rack-locality-delay=20, the scheduler will
            attempt rack-local assignments after 40 missed opportunities, and off-switch assignments
            after 40+20=60 missed opportunities.
            When setting this parameter, the size of the cluster should be taken into account.
            We use -1 as the default value, which disables this feature. In this case, the number
            of missed opportunities for assigning off-switch containers is calculated based on
            the number of containers and unique locations specified in the resource request,
            as well as the size of the cluster.
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.queue-mappings</name>
        <value></value>
        <description>
            A list of mappings that will be used to assign jobs to queues
            The syntax for this list is [u|g]:[name]:[queue_name][,next mapping]*
            Typically this list will be used to map users to queues,
            for example, u:%user:%user maps all users to queues with the same name
            as the user.
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.queue-mappings-override.enable</name>
        <value>false</value>
        <description>
            If a queue mapping is present, will it override the value specified
            by the user? This can be used by administrators to place jobs in queues
            that are different than the one specified by the user.
            The default is false.
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.per-node-heartbeat.maximum-offswitch-assignments</name>
        <value>1</value>
        <description>
            Controls the number of OFF_SWITCH assignments allowed
            during a node's heartbeat. Increasing this value can improve
            scheduling rate for OFF_SWITCH containers. Lower values reduce
            "clumping" of applications on particular nodes. The default is 1.
            Legal values are 1-MAX_INT. This config is refreshable.
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.application.fail-fast</name>
        <value>false</value>
        <description>
            Whether RM should fail during recovery if previous applications'
            queue is no longer valid.
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.workflow-priority-mappings</name>
        <value></value>
        <description>
            A list of mappings that will be used to override application priority.
            The syntax for this list is
            [workflowId]:[full_queue_name]:[priority][,next mapping]*
            where an application submitted (or mapped to) queue "full_queue_name"
            and workflowId "workflowId" (as specified in application submission
            context) will be given priority "priority".
        </description>
    </property>

    <property>
        <name>yarn.scheduler.capacity.workflow-priority-mappings-override.enable</name>
        <value>false</value>
        <description>
            If a priority mapping is present, will it override the value specified
            by the user? This can be used by administrators to give applications a
            priority that is different than the one specified by the user.
            The default is false.
        </description>
    </property>

</configuration>

7、新增文件夹目录并赋权给hadoop用户(root账号操作)

cd /data
mkdir hadoop
cd hadoop
mkdir tmp
mkdir var
mkdir dfs
cd dfs
mkdir name
mkdir data
chown -R hadoop:hadoop /data/hadoop
8、配置hadoop的环境变量

```bash
vim /etc/profile
#hadoop安装路径
export HADOOP_HOME=/opt/hadoop
export CLASSPATH=$($HADOOP_HOME/bin/hadoop classpath):$CLASSPATH
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
#刷新配置文件
source /etc/profile
#校验是否安装完成
hadoop version

ubuntu下安装iostat ubuntu下安装hadoop集群_ci_02


9、进入bin目录

cd /opt/hadoop/bin
hadoop classpath
#复制路径信息添加
vim /opt/hadoop/etc/hadoop/yarn-site.xml
<property>
     <name>yarn.application.classpath</name>
     <!-- 上面复制的路径信息 -->
     <value>/opt/hadoop/etc/hadoop:/opt/hadoop/share/hadoop/common/lib/*:/opt/hadoop/share/hadoop/common/*:/opt/hadoop/share/hadoop/hdfs:/opt/hadoop/share/hadoop/hdfs/lib/*:/opt/hadoop/share/hadoop/hdfs/*:/opt/hadoop/share/hadoop/mapreduce/*:/opt/hadoop/share/hadoop/yarn:/opt/hadoop/share/hadoop/yarn/lib/*:/opt/hadoop/share/hadoop/yarn/*</value>
</property>

ubuntu下安装iostat ubuntu下安装hadoop集群_vim_03


参考配置:

<?xml version="1.0"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->
<configuration>
    <!-- rm失联后重新链接的时间 -->
    <property>
        <name>yarn.resourcemanager.connect.retry-interval.ms</name>
        <value>2000</value>
    </property>
    <property>
        <!-- 启用RM高可用 -->
        <name>yarn.resourcemanager.ha.enabled</name>
        <value>true</value>
        <description>启动Yarn HA模式</description>
    </property>
    <property>
        <!-- 指定两台RM主机名标识符 -->
        <name>yarn.resourcemanager.ha.rm-ids</name>
        <value>rm1,rm2</value>
        <description>resourcemanager id</description>
    </property>
    <property>
        <!--Ha功能,需要一组zk地址,用逗号分隔。被ZKFailoverController使用于自动失效备援failover。 -->
        <name>ha.zookeeper.quorum</name>
        <value>hdp01:2181,hdp02:2181,hdp03:2181,hdp04:2181</value>
        <description>Zookeeper 队列</description>
    </property>
    <property>
        <!--开启失效转移 -->
        <name>yarn.resourcemanager.ha.automatic-failover.enabled</name>
        <value>true</value>
        <description>开启 ResourceManager 故障自动切换</description>
    </property>
    <property>
        <!-- 指定rm1的主机名-->
        <name>yarn.resourcemanager.hostname.rm1</name>
        <value>hdp01</value>
        <description>rm1 的hostname</description>
    </property>
    <property>
        <!-- 指定rm2的主机名-->
        <name>yarn.resourcemanager.hostname.rm2</name>
        <value>hdp02</value>
        <description>rm2 的hostname</description>
    </property>
    <property>
        <name>yarn.resourcemanager.ha.id</name>
        <value>rm1</value>
        <description>本机的rmid</description>
    </property>
    <property>
        <!-- RM故障自动恢复 -->
        <name>yarn.resourcemanager.recovery.enabled</name>
        <value>true</value>
    </property>
    <property>
        <!-- 被RM用于状态存储的ZooKeeper服务器的主机:端口号,多个ZooKeeper的话使用逗号分隔。 -->
        <name>yarn.resourcemanager.zk-state-store.address</name>
        <value>hdp01:2181,hdp02:2181,hdp03:2181,hdp04:2181</value>
    </property>
    <property>
        <!-- 配置RM状态信息存储方式,有MemStore和ZKStore。 -->
        <name>yarn.resourcemanager.store.class</name>
        <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
    </property>
    <property>
        <!--使用ZK集群保存状态信息,指定zookeeper队列 -->
        <name>yarn.resourcemanager.zk-address</name>
        <value>hdp01:2181,hdp02:2181,hdp03:2181,hdp04:2181</value>
    </property>
    <property>
        <name>yarn.resourcemanager.cluster-id</name>
        <value>LN-rslog</value>
        <description>集群ID</description>
    </property>
    <property>
        <!-- schelduler失联等待连接时间,以毫秒为单位-->
        <name>yarn.app.mapreduce.am.scheduler.connection.wait.interval-ms</name>
        <value>5000</value>
    </property>
    <!-- 以下开始对 rm1 进行配置,rm2 改成对应的值!!! -->
    <property>
        <!-- 客户端通过该地址向RM提交对应用程序操作 -->
        <name>yarn.resourcemanager.address.rm1</name>
        <value>hdp01:8032</value>
    </property>
    <property>
        <!--ResourceManager 对ApplicationMaster暴露的访问地址。ApplicationMaster通过该地址向RM申请资源、释放资源等。 -->
        <name>yarn.resourcemanager.scheduler.address.rm1</name>
        <value>hdp01:8030</value>
    </property>
    <property>
        <!-- RM HTTP访问地址,查看集群信息-->
        <name>yarn.resourcemanager.webapp.address.rm1</name>
        <value>hdp01:8088</value>
    </property>
    <property>
        <!-- NodeManager通过该地址交换信息 -->
        <name>yarn.resourcemanager.resource-tracker.address.rm1</name>
        <value>hdp01:8031</value>
    </property>
    <property>
        <!--管理员通过该地址向RM发送管理命令 -->
        <name>yarn.resourcemanager.admin.address.rm1</name>
        <value>hdp01:8033</value>
    </property>
    <property>
        <name>yarn.resourcemanager.ha.admin.address.rm1</name>
        <value>hdp01:23142</value>
    </property>
    <property>
        <name>yarn.resourcemanager.address.rm2</name>
        <value>hdp02:8032</value>
    </property>
    <property>
        <name>yarn.resourcemanager.scheduler.address.rm2</name>
        <value>hdp02:8030</value>
    </property>
    <property>
        <name>yarn.resourcemanager.webapp.address.rm2</name>
        <value>hdp02:8088</value>
    </property>
    <property>
        <name>yarn.resourcemanager.resource-tracker.address.rm2</name>
        <value>hdp02:8031</value>
    </property>
    <property>
        <name>yarn.resourcemanager.admin.address.rm2</name>
        <value>hdp02:8033</value>
    </property>
    <property>
        <name>yarn.resourcemanager.ha.admin.address.rm2</name>
        <value>hdp02:23142</value>
    </property>
    <property>
        <!-- 为了能够运行MapReduce程序,需要让各个NodeManager在启动时加载shuffle server,shuffle server实际上是Jetty/Netty Server,Reduce Task通过该server从各个NodeManager上远程拷贝Map Task产生的中间结果。下面增加的两个配置均用于指定shuffle serve。 -->
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
    </property>
    <property>
        <!--中间结果存放位置,类似于1.0中的mapred.local.dir。注意,这个参数通常会配置多个目录,已分摊磁盘IO负载 -->
        <name>yarn.nodemanager.local-dirs</name>
        <value>/data/hadoop/yarn/tmp</value>
    </property>
    <property>
        <!-- yarn node 运行时日志存放地址,记录container日志,并非nodemanager日志存放地址 -->
        <name>yarn.nodemanager.log-dirs</name>
        <value>/data/hadoop/yarn/log</value>
    </property>
    <property>
        <name>mapreduce.shuffle.port</name>
        <value>23080</value>
    </property>
    <property>
        <!-- 以轮训方式寻找活动的RM所使用的类-->
        <name>yarn.client.failover-proxy-provider</name>
        <value>org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider</value>
    </property>
    <property>
        <name>yarn.resourcemanager.ha.automatic-failover.zk-base-path</name>
        <value>/yarn-leader-election</value>
    </property>
    <property>
        <!-- 每使用一个物理cpu,可以使用的虚拟cpu的比例,默认为2-->
        <name>yarn.nodemanager.vcores-pcores-ratio</name>
        <value>1</value>
    </property>
    <property>
        <!-- 每单位的物理内存总量对应的虚拟内存量,默认是2.1,表示每使用1MB的物理内存,最多可以使用2.1MB的虚拟内存总量。-->
        <name>yarn.nodemanager.vmem-pmem-ratio</name>
        <value>5.2</value>
    </property>
    <!--
    (2)yarn.nodemanager.vmem-pmem-ratio

    任务每使用1MB物理内存,最多可使用虚拟内存量,默认是2.1
    (3) yarn.nodemanager.pmem-check-enabled

    是否启动一个线程检查每个任务正使用的物理内存量,如果任务超出分配值,则直接将其杀掉,默认是true。
    (4) yarn.nodemanager.vmem-check-enabled

    是否启动一个线程检查每个任务正使用的虚拟内存量,如果任务超出分配值,则直接将其杀掉,默认是true。 -->
    <property>
        <name>yarn.nodemanager.vmem-check-enabled</name>
        <value>false</value>
    </property>
    <property>
        <!-- 表示该节点上YARN可使用的虚拟CPU个数,默认是8,注意,目前推荐将该值设值为与物理CPU核数数目相同。如果你的节点CPU核数不够8个,则需要调减小这个值,而YARN不会智能的探测节点的物理CPU总数 -->
        <name>yarn.nodemanager.resource.cpu-vcores</name>
        <value>1</value>
    </property>
    <property>
        <!-- 表示该节点上YARN可使用的物理内存总量,默认是8192(MB),注意,如果你的节点内存资源不够8GB,则需要调减小这个值,而YARN不会智能的探测节>点的物理内存总量。 -->
        <name>yarn.nodemanager.resource.memory-mb</name>
        <value>10240</value>
    </property>
    <property>
        <!-- 单个任务可申请的最少物理内存量,默认是1024(MB),如果一个任务申请的物理内存量少于该值,则该对应的值改为这个数 -->
        <name>yarn.scheduler.minimum-allocation-mb</name>
        <value>256</value>
    </property>
    <property>
        <!-- 单个任务可申请的最多物理内存量,默认是8192(MB)。

        默认情况下,YARN采用了线程监控的方法判断任务是否超量使用内存,一旦发现超量,则直接将其杀死。由于Cgroups对内存的控制缺乏灵活性(即任务任何时刻不能超过内存上限,如果超过,则直接将其杀死或者报OOM),而Java进程在创建瞬间内存将翻倍,之后骤降到正常值,这种情况下,采用线程监控的方式更加灵活(当发现进程树内存瞬间翻倍超过设定值时,可认为是正常现象,不会将任务杀死),因此YARN未提供Cgroups内存隔离机制 -->
        <name>yarn.scheduler.maximum-allocation-mb</name>
        <value>40960</value>
    </property>
    <!--指定yarn.log.server.url所在节点-->
    <property>
    <!-- 执行job日志存储路径-->
	<name>yarn.log.server.url</name>
        <value>http://hdp01:19888/data/hadoop/yarn/jobhistory/logs</value>
    </property>
    <!-- 运行环境 hadoop classpath-->
    <property>
        <name>yarn.application.classpath</name>
        <value>
            /opt/hadoop/etc/hadoop:/opt/hadoop/share/hadoop/common/lib/*:/opt/hadoop/share/hadoop/common/*:/opt/hadoop/share/hadoop/hdfs:/opt/hadoop/share/hadoop/hdfs/lib/*:/opt/hadoop/share/hadoop/hdfs/*:/opt/hadoop/share/hadoop/mapreduce/*:/opt/hadoop/share/hadoop/yarn:/opt/hadoop/share/hadoop/yarn/lib/*:/opt/hadoop/share/hadoop/yarn/*
        </value>
    </property>

</configuration>

10、复制hadoop环境变量到hadoop用户下,否则用户执行hadoop格式化失败

su hadoop
cd ~
vim .bashrc
export HADOOP_HOME=/opt/hadoop
export CLASSPATH=$($HADOOP_HOME/bin/hadoop classpath):$CLASSPATH
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin

11、复制hadoop到别的从服务器

rsync -av /opt/hadoop hdp02:/opt
rsync -av /opt/hadoop hdp03:/opt
rsync -av /opt/hadoop hdp04:/opt
rsync -av /opt/hadoop hdp05:/opt
rsync -av /data/hadoop hdp02:/data
rsync -av /data/hadoop hdp03:/data
rsync -av /data/hadoop hdp04:/data
rsync -av /data/hadoop hdp05:/data

12、所有的从服务器配置环境变量信息
所有的机子重复第10步

13、格式化Master(hdp01)

source ~/.bashrc
#只需格式化NameNode即可;DataNode无需单独格式化,首次启动时会自动初始化数据目录
hdfs namenode -format

14、启动所有服务

su hadoop
cd /opt/hadoop/sbin
./start-all.sh

ubuntu下安装iostat ubuntu下安装hadoop集群_hive_04


15、查看日志启动日志信息(如果报格式化失败就重新执行hadoop namenode -format/hadoop datanode -format)

tail -f -n 300 /opt/hadoop/logs/hadoop-hadoop-namenode-ubuntu-198.log

ubuntu下安装iostat ubuntu下安装hadoop集群_hadoop_05


16、校验所有服务是否启动成功

jps

ubuntu下安装iostat ubuntu下安装hadoop集群_hadoop_06


浏览器输入

http://10.0.0.198:9001/status.html

ubuntu下安装iostat ubuntu下安装hadoop集群_hadoop_07

http://10.0.0.198:8088/cluster/nodes


#创建多级文件夹
hadoop fs -mkdir -p /data/hadoop/
#上传文件
hdfs dfs -put ./*.txt hdfs://hdp01:9000/data/hadoop/
#获取命令
hdfs dfs -ls hdfs://hdp01:9000/data/hadoop/*.txt
#移除文件
hdfs dfs -rm /data/hadoop/1.txt
#移除文件夹
hdfs dfs -rm  -r /data/hadoop
#导入某个文件到分区表上,字段一一映射,以空格隔开(根据建表的空格符),分区字段不需要在文件中
load data local inpath '/tmp/28b7c256-980a-4638-bd47-e83e478f504e.txt' overwrite into table vehicle_hive.t_veh_data_distinct_unzip partition(dt='20200407');

17、安装Hive

hive用户使用hadoop用户,权限相同,否则需要配置hive调用hadoop权限

下载Hive 注意Hadoop版本

ubuntu下安装iostat ubuntu下安装hadoop集群_ci_08


下载路径,下载慢可以试下迅雷能不能下载快点

18、安装,配置环境变量

cd /opt
tar -xvf apache-hive-3.1.3-bin.tar.gz
mv apache-hive-3.1.3-bin/ ./hive
chown -R hadoop:hadoop hive
su hadoop
cd ~
vim .bashrc
export HIVE_HOME=/opt/hive                                                                                                                                                                                                                       
export PATH=$PATH:$HIVE_HOME/bin
source .bashrc
cd /opt/hive/conf
mv hive-env.sh.template ./hive-env.sh
mv hive-default.xml.template ./hive-site.xml
#hive.exec.local.scratchdir和hive.downloaded.resources.dir
#新建3个目录,是指本地目录(必须先手动建好)
#如果没有创建权限,可以先用root用户创建,之后通过 chown -R hadoop:hadoop /data/hive 赋权给hadoop
mkdir -p /data/hive/scratchdir
mkdir -p /data/hive/downloaded
mkdir -p /data/hive/logs
vim hive-env.sh
#根据实际情况是否开放注释,默认申请的JVM堆内存256M
export HADOOP_HEAPSIZE=1024
#JDK路径
export JAVA_HOME=/opt/jdk1.8.0_101
#HIVE路径
export HIVE_HOME=/opt/hive
#hadoop路径
export HADOOP_HOME=/opt/hadoop
export HIVE_CONF_DIR=$HIVE_HOME/conf 
export HIVE_AUX_JARS_PATH=$HIVE_HOME/lib/*
vim hive-site.xml
#根据property的name属性找到对应的位置,修改对应的值信息
#/data/hive/scratchdir
hive.exec.local.scratchdir
#/data/hive/downloaded
hive.downloaded.resources.dir
#Mysql url jdbc:mysql://ip:3306/hive?useUnicode=true&characterEncoding=utf-8&allowMultiQueries=true
#&符号非法
javax.jdo.option.ConnectionURL
#mysql驱动,根据数据库版本下载对应的驱动,5.7和8是不同的
javax.jdo.option.ConnectionDriverName
#mysql用户名
javax.jdo.option.ConnectionUserName
#mysql密码
javax.jdo.option.ConnectionPassword
#删除3215行的非法字符
#默认为NONE,改为CUSTOM,需要用户名密码
hive.server2.authentication
#存储验证用户名密码方法,此处的用法根据实际需求,我这里在hive数据库新建一张db_user表,只有用户名密码
#SELECT user_name,pass_word FROM hive.db_user
hive.server2.custom.authentication.class

ubuntu下安装iostat ubuntu下安装hadoop集群_hive_09


模板(别的配置可以不要,用默认的,也可以只修改下面几个配置的name)

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://hdp04:3306/hive?useUnicode=true&amp;characterEncoding=utf-8&amp;allowMultiQueries=true</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.jdbc.Driver</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>suyun</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>suyun123</value>
    </property>

    <property>
        <name>datanucleus.readOnlyDatastore</name>
        <value>false</value>
    </property>
    <property>
        <name>datanucleus.fixedDatastore</name>
        <value>false</value>
    </property>

    <property>
        <name>datanucleus.autoCreateSchema</name>
        <value>true</value>
    </property>

    <property>
        <name>datanucleus.autoCreateTables</name>
        <value>true</value>
    </property>

    <property>
        <name>datanucleus.autoCreateColumns</name>
        <value>true</value>
    </property>

    <property>
        <name>hive.exec.local.scratchdir</name>
        <value>/data/hive/scratchdir</value>
    </property>
    <property>
        <name>hive.downloaded.resources.dir</name>
        <value>/data/hive/downloaded</value>
    </property>
    <property>
        <name>hive.exec.dynamic.partition</name>
        <value>true</value>
    </property>
    <property>
        <name>hive.exec.dynamic.partition.mode</name>
        <value>nonstrict</value>
    </property>
    <property>
        <name>hive.optimize.sort.dynamic.partition</name>
        <value>true</value>
    </property>
    <property>
        <name>mapreduce.map.java.opts</name>
        <value>-Xmx4096m</value>
    </property>
    <property>
        <name>hive.exec.max.dynamic.partitions.pernode</name>
        <value>10000</value>
    </property>
    <property>
        <name>hive.exec.max.dynamic.partitions</name>
        <value>100000</value>
    </property>
    <property>
        <name>hive.exec.max.created.files</name>
        <value>150000</value>
    </property>
    <property>
        <name>hive.merge.mapfiles</name>
        <value>true</value>
    </property>
    <property>
        <name>hive.merge.mapredfiles</name>
        <value>true</value>
    </property>
    <property>
        <name>hive.exec.parallel</name>
        <value>true</value>
    </property>
    <!-- hive server2的使用端口,默认就是10000-->
    <property>
        <name>hive.server2.thrift.port</name>
        <value>10000</value>
        <description>Port number of HiveServer2 Thrift interface.Can be overridden by setting $HIVE_SERVER2_THRIFT_PORT</description>
    </property>
    <!-- hive server2绑定的主机名-->
    <property>
        <name>hive.server2.thrift.bind.host</name>
        <value>hdp01</value>
        <description>Bind host on which to run the HiveServer2 Thrift interface.Can be overridden by setting $HIVE_SERVER2_THRIFT_BIND_HOST</description>
    </property>
    <property>
        <name>hive.execution.engine</name>
        <value>spark</value>
    </property>
    <property>
        <name>hive.enable.spark.execution.engine</name>
        <value>true</value>
    </property>
    <property>
        <name>spark.home</name>
        <value>/opt/spark</value>
    </property>
    <property>
        <name>spark.master</name>
        <value>yarn</value>
    </property>
    <property>
        <name>spark.eventLog.enabled</name>
        <value>true</value>
    </property>
    <property>
        <name>spark.eventLog.dir</name>
        <value>hdfs://hdp01:9000/data/hadoop/spark/spark-hive-jobhistory</value>
    </property>
    <property>
        <name>spark.executor.memory</name>
        <value>4096m</value>
    </property>
    <property>
        <name>spark.driver.memory</name>
        <value>4096m</value>
    </property>
    <property>
        <name>spark.serializer</name>
        <value>org.apache.spark.serializer.KryoSerializer</value>
    </property>
    <property>
        <name>spark.yarn.jars</name>
        <value>hdfs://hdp01:9000/data/hadoop/spark/spark-jars/*</value>
    </property>
    <property>
        <name>hive.spark.client.connect.timeout</name>                                                                                                                                                                                    
        <value>600000ms</value>
    </property>
    <property>
        <name>hive.spark.client.server.connect.timeout</name>
        <value>600000ms</value>
    </property>
</configuration>

19、创建log4j信息

cp hive-log4j2.properties.template ./hive-log4j2.properties
vim hive-log4j2.properties
# While troubleshooting, switch this to DEBUG to get much more detailed logs
property.hive.log.level = INFO
property.hive.log.dir = /data/hive/logs
# From the CLI you can also override the log level temporarily with the command below
bin/hive --hiveconf hive.root.logger=DEBUG,console  -e "TRUNCATE TABLE veh_terminal_protocol;"

ubuntu下安装iostat ubuntu下安装hadoop集群_ubuntu下安装iostat_10


20、下载Mysql8.0.16版本驱动

下载Mysql驱动包 mysql-connector-java

ubuntu下安装iostat ubuntu下安装hadoop集群_ubuntu下安装iostat_11


上传jar包到/opt/hive/lib下面

ubuntu下安装iostat ubuntu下安装hadoop集群_ci_12


手动创建hive元数据库(名为hive的database),否则执行删除语句可能报错

mysql -u root -p
mysql>SET GLOBAL binlog_format = 'ROW';
mysql>commit;
mysql>create database hive;
mysql>alter database hive character set latin1;
# Initialize the Hive metastore schema inside the freshly created database
cd /opt/hive/bin
schematool -initSchema -dbType mysql -verbose

ubuntu下安装iostat ubuntu下安装hadoop集群_vim_13


ubuntu下安装iostat ubuntu下安装hadoop集群_vim_14


21、上传hive用户名密码jar

下载校验jar包,并在hive数据库中新增表db_user

# Switch column-comment storage to utf8 so non-ASCII (e.g. Chinese) comments survive
alter table COLUMNS_V2 modify column COMMENT varchar(256) character set utf8;
# Table-level parameter values
alter table TABLE_PARAMS modify column PARAM_VALUE varchar(4000) character set utf8;
# Partition key comments and partition parameter values
alter table PARTITION_KEYS modify column PKEY_COMMENT varchar(4000) character set utf8;
alter table PARTITION_PARAMS modify column PARAM_VALUE varchar(4000) character set utf8;
# Index parameter values
alter table INDEX_PARAMS modify column PARAM_VALUE varchar(4000) character set utf8;
-- ----------------------------
-- Table structure for db_user (credentials checked by the HiveServer2 auth jar)
-- ----------------------------
DROP TABLE IF EXISTS `db_user`;
CREATE TABLE `db_user`  (
  `user_name` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '登录用户名',
  `pass_word` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '密码MD5加密',
  PRIMARY KEY (`user_name`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;

-- ----------------------------
-- Records of db_user -- default user: hive / hadoop, password: 123456 (stored as its MD5 hash)
-- ----------------------------
INSERT INTO `db_user` VALUES ('hive', 'e196a015eac49789e43b18c4ab565198');
INSERT INTO `db_user` VALUES ('hadoop', 'e196a015eac49789e43b18c4ab565198');

上传jar包到/opt/hive/lib下面

ubuntu下安装iostat ubuntu下安装hadoop集群_hive_15

# Check that the guava-*.jar bundled with Hive contains the methods Hadoop calls
# (e.g. Stopwatch.createUnstarted); Hive's guava-19.0 is too old for Hadoop 3.3.x,
# so remove it and reuse Hadoop's Guava to keep both on the same version.
rm -f -- /opt/hive/lib/guava-19.0.jar
# Hadoop 3.3.2 ships guava-27.0-jre.jar (not 19.0) under share/hadoop/common/lib
cp /opt/hadoop/share/hadoop/common/lib/guava-27.0-jre.jar /opt/hive/lib

22、启动hive(启动之前确定hadoop已经启动,没有启动,需要先启动hadoop)

# Start the metastore first, then HiveServer2 (Hadoop must already be running)
nohup hive --service metastore &
nohup  hive --service hiveserver2 &
# Verify HiveServer2 is listening on its Thrift port (default 10000)
lsof -i:10000

ubuntu下安装iostat ubuntu下安装hadoop集群_vim_16

# Enter the Hive CLI
hive
# Create the vehicle_hive database at /data/hadoop/dfs/vehicle_hive.db/ (a 'default' database already exists)
create database vehicle_hive location '/data/hadoop/dfs/vehicle_hive.db/';

JAVA代码测试获取连接

ubuntu下安装iostat ubuntu下安装hadoop集群_vim_17

使用DolphinScheduler连接成功

ubuntu下安装iostat ubuntu下安装hadoop集群_ci_18


23、校验Hadoop

hadoop checknative -a
# Entries reported "false" are unsupported; fix by building the library from a
# stable release (do not use dev snapshots)
ISA-L: false
apt-get install yasm
apt-get install autoconf automake
# Download the zip from https://github.com/intel/isa-l and upload it (original text used a garbled mirror URL)
unzip isa-l-master.zip
cd isa-l-master
./autogen.sh
./configure
make
make install
zstd  :  false
# Download the zip from https://github.com/facebook/zstd and upload it
unzip zstd-zstd-0.4.2.zip
cd zstd-zstd-0.4.2
make
make install

配置文件修改模板:
Hadoop:
Hive: