一、卸载
卸载原有驱动
方法一:找到显卡驱动.run文件:
$ sh NVIDIA-Linux-x86_64-418.126.02.run --uninstall
方法二:清除所有nvidia相关文件和依赖
$ yum remove nvidia-*
为了删除干净还可以(目标就是把nvidia-driver的相关组件都清理掉):
rpm -qa|grep -i nvid|sort
yum remove kmod-nvidia-*
卸载原有cuda
如果是rpm安装
sudo rpm -e cuda-repo[按Tab补全,或用 rpm -qa | grep cuda-repo 查出完整包名]
sudo yum -y remove nvidia-*
sudo yum -y remove cuda-*
sudo rm -rf /usr/local/cuda*
如果是.run安装
sudo nvidia-uninstall #卸载驱动
sudo /usr/local/cuda/bin/cuda-uninstaller #卸载cuda(卸载程序位于bin目录下);9.2版本的卸载脚本后缀名为.pl,即 uninstall_cuda_9.2.pl
清理gcc环境
sudo yum autoremove -y gcc*
sudo yum autoremove -y cpp*
sudo yum autoremove -y dkms
sudo rm -rf /opt/rh #删除由devtoolset安装的gcc
sudo rm -rf /usr/lib/gcc
sudo rm -rf /usr/libexec/gcc
whereis gcc
sudo yum install -y gcc
sudo yum install -y gcc-c++
sudo yum install -y glibc-devel
gcc -v
安装cuda以及nvidia driver
#install cuda from .run file from https://developer.nvidia.com/cuda-downloads
sudo sh cuda_10.1.105_418.39_linux.run # 也可在此安装driver,更新版本的用下条命令安装
sudo sh NVIDIA-Linux-x86_64-418.56.run # install the driver
#install cuda from .rpm file
修改cuda源:编辑 /etc/yum.repos.d/cuda.repo,将其中的 baseurl 改为下面的镜像地址
https://mirrors.aliyun.com/nvidia-cuda/rhel7/x86_64/
安装cudnn
rm -rf cuda
tar -xzvf cudnn-10.1-linux-x64-v7.5.1.10.tgz
sudo cp cuda/include/cudnn.h /usr/local/cuda/include
sudo cp cuda/lib64/libcudnn* /usr/local/cuda/lib64
sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
测试
reboot
nvidia-smi
测试时如果报错 "Failed to initialize NVML: Driver/library version mismatch",说明内核中已加载的驱动模块与用户态库版本不一致,通常重启(或重新加载nvidia内核模块)即可解决
详见参考资料:NVML
编译安装gcc5.4.0
# 从 http://ftp.tsukuba.wide.ad.jp/software/gcc/releases/ 下载 gcc-5.4.0 源码包,解压后进入源码目录
cd gcc-5.4.0
./contrib/download_prerequisites
mkdir build
cd build
../configure --enable-checking=release --enable-languages=c,c++ --disable-multilib #--prefix指定安装路径,默认/usr/local
make -j $(nproc)
sudo make install
编译gcc时如果出现 error "Unable to find a suitable type for HOST_WIDE_INT",先执行下面的命令清除相关环境变量后再重新 configure:
unset LIBRARY_PATH CPATH C_INCLUDE_PATH PKG_CONFIG_PATH CPLUS_INCLUDE_PATH INCLUDE
修改gcc环境
# 添加/etc/profile.d/gcc5.4path.sh,影响root及所有账号,内容为
export PATH=/usr/local/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/usr/local/lib64:$LD_LIBRARY_PATH
# 添加/etc/sudoers.d/sudopath,对sudo path产生影响
Defaults secure_path = /sbin:/usr/sbin:/usr/local/bin:/usr/bin:/usr/local/openmpi/bin:/bin
二、实战
1001 2021-08-27 14:53:47 root sudo sh cuda_11.1.0_455.23.05_linux.run
1002 2021-08-27 15:03:47 root yum remove nvidia-*
1003 2021-08-27 15:05:25 root rpm -qa|grep -i nvid|sort
1004 2021-08-27 15:06:04 root yum remove yum-plugin-nvidia-1.0.2-1.el7.elrepo.noarch
1005 2021-08-27 15:06:11 root ls
1006 2021-08-27 15:06:38 root yum remove cuda
1007 2021-08-27 15:06:42 root yum remove cuda*
1008 2021-08-27 15:06:51 root nvidia-smi
1009 2021-08-27 15:06:58 root nvcc -V
1010 2021-08-27 15:08:23 root sudo dpkg -l |grep cuda
1011 2021-08-27 15:09:19 root sudo rpm -e cuda*
1012 2021-08-27 15:09:43 root sudo yum -y remove nvidia-*
1013 2021-08-27 15:09:50 root sudo yum -y remove cuda-*
1014 2021-08-27 15:10:09 root sudo rm -rf /usr/local/cuda*
1015 2021-08-27 15:10:29 root sudo nvidia-uninstall
1016 2021-08-27 15:10:49 root sudo /usr/local/cuda/cuda-uninstaller
1017 2021-08-27 15:11:25 root ls
1018 2021-08-27 15:11:37 root sudo sh cuda_11.1.0_455.23.05_linux.run
1019 2021-08-27 15:14:08 root rpm -qa cuda
1020 2021-08-27 15:14:58 root rpm -qa cuda*
1021 2021-08-27 15:15:22 root nvcc -V
1022 2021-08-27 15:16:33 root wget https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda-repo-rhel7-10-1-local-10.1.105-418.39-1.0-1.x86_64.rpm
1023 2021-08-27 15:21:30 root ls
1024 2021-08-27 15:21:44 root sudo rpm -i cuda-repo-rhel7-10-1-local-10.1.105-418.39-1.0-1.x86_64.rpm
1025 2021-08-27 15:22:50 root yum search cuda*
1026 2021-08-27 15:23:01 root yum search cuda
1027 2021-08-27 15:23:50 root yum list | grep cuda
1028 2021-08-27 15:24:25 root sudo yum clean all
1029 2021-08-27 15:24:37 root sudo yum install cuda
1030 2021-08-27 15:31:37 root nvidia-smi
1031 2021-08-27 15:31:48 root nvcc -V