多机nccl Tests测试
2024-07-02
2分钟阅读时长
系统已预装 NVIDIA 驱动 550.90.07 以及与之兼容的 nvidia-fabricmanager。
共有四张用于计算的 IB 网卡,速率均为 200 Gb/s。
环境设置
ssh 互相免密
hosts 设置好
关闭 CPU 超线程和虚拟化
关闭acs
#!/bin/bash
# Turn off PCIe ACS on every PCI bridge that exposes it.
# A timestamped banner and all setpci output are appended to /var/log/acs.log.
printf '==============%s stop acs =================\n' "$(date)" >> /var/log/acs.log
for bridge in $(lspci -vvv | grep "PCI bridge" | awk '{print $1}'); do
  # Skip bridges whose verbose listing has no ACSCtl line.
  if lspci -s "$bridge" -vvv | grep ACSCtl > /dev/null; then
    # Zero the 16-bit word at offset 6 of the ACS extended capability
    # (presumably the ACS Control register that lspci reports as ACSCtl).
    setpci -v -s "$bridge" ecap_acs+6.w=0 &>> /var/log/acs.log
  fi
done
安装cuda
# Fetch the CUDA 12.5.0 runfile installer and start it.
cuda_runfile=cuda_12.5.0_555.42.02_linux.run
wget "https://developer.download.nvidia.com/compute/cuda/12.5.0/local_installers/${cuda_runfile}"
bash "${cuda_runfile}"
安装时记得取消勾选 driver 选项(系统已预装驱动)
设置环境变量
# Persist the CUDA toolchain paths for future shells.
# Single quotes keep $PATH / $LD_LIBRARY_PATH literal, so they are expanded
# each time ~/.bashrc is read rather than freezing today's values into the file.
echo 'export PATH=/usr/local/cuda/bin:$PATH' >> ~/.bashrc
# ${VAR:+:$VAR} appends the old value only when it is non-empty, avoiding a
# trailing ':' (an empty search-path entry) when LD_LIBRARY_PATH is unset.
echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}' >> ~/.bashrc
source ~/.bashrc
安装nccl
# Register NVIDIA's CUDA apt repository (Ubuntu 22.04, x86_64) through its
# keyring package, then install the NCCL runtime and development packages.
keyring_deb=cuda-keyring_1.0-1_all.deb
wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/${keyring_deb}"
apt install "./${keyring_deb}"
apt update
apt install -y libnccl2 libnccl-dev
创建共享存储
gpu3
# NFS server side: install the server, create the export directory and
# publish /share to 10.0.0.4.
apt install nfs-server
mkdir /share
printf '%s\n' '/share 10.0.0.4(rw,sync,no_root_squash,no_subtree_check,insecure)' >> /etc/exports
# Re-export everything listed in /etc/exports.
exportfs -ra
gpu4
# NFS client side. On Ubuntu the mount.nfs helper ships in nfs-common; without
# it, mounting a host:/path source fails, so install it first.
apt install -y nfs-common
# -p: do not fail when the mount point already exists (safe to re-run).
mkdir -p /share
mount -t nfs gpu3:/share /share
安装hpcx
# Download and unpack the HPC-X 2.17.1 bundle onto the shared filesystem.
hpcx_tarball=hpcx-v2.17.1-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64.tbz
cd /share
wget "https://content.mellanox.com/hpc/hpc-x/v2.17.1rc2/${hpcx_tarball}"
tar xf "${hpcx_tarball}"
编译nccl tests
# Fetch nccl-tests v2.13.9 onto the shared filesystem and build it with MPI support.
cd /share
wget https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v2.13.9.tar.gz
tar xf v2.13.9.tar.gz
cd nccl-tests-2.13.9/
# make's parallel-jobs flag is lowercase -j; uppercase -J is rejected as an
# invalid option. One job per CPU reported by nproc.
# NOTE(review): if the HPC-X MPI is meant to be used, its environment probably
# has to be loaded first and MPI_HOME passed on this command line — confirm.
make -j"$(nproc)" MPI=1
测试nccl tests
单节点:
# Single-node all-reduce sweep: sizes from 16M to 8G multiplied by 2 each step
# (-f 2, previously passed twice), 8 GPUs (-g 8), result checking on (-c 1).
# Invoked as ./all_reduce_perf to match the multi-node example; run it from the
# directory that holds the built binary (presumably build/ — confirm).
./all_reduce_perf -b 16M -e 8G -f 2 -g 8 -c 1
gpu3结果
多节点 (slurm by srun)
# Two-node all-reduce run launched directly through srun.
# NOTE(review): three HCAs are listed although the hosts are described as
# having four IB NICs — confirm the list is complete.
export NCCL_SOCKET_IFNAME=bond0 \
       NCCL_IB_HCA=ib0,ib2,ib4 \
       NCCL_DEBUG=WARN
srun -N 2 -c 8 --gres=gpu:a800:8 \
  ./all_reduce_perf -b 16M -e 8G -f 2 -g 8 -c 1
sbatch脚本参考 arp.slurm:
#!/bin/bash
# arp.slurm: batch job that runs all_reduce_perf with 8 tasks per node,
# 4 CPUs per task and 8 GPUs per node; each task drives one GPU (-g 1).
#SBATCH --ntasks-per-node=8
#SBATCH -c 4
#SBATCH --gres=gpu:8
module load intel/2021.4.0 cuda/12.2 intel/slurm-support
# Raise the per-process open-file limit for this job.
ulimit -n 102400
export NCCL_IB_QPS_PER_CONNECTION=2
export NCCL_NSOCKS_PERTHREAD=3
export NCCL_SOCKET_NTHREADS=4
export NCCL_SOCKET_IFNAME=ib3
export NCCL_IB_HCA=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1
export NCCL_DEBUG=WARN
# Quoted on purpose: a compressed node list such as "gpu[3-4]" contains glob
# characters and would be pathname-expanded if a matching file existed in the
# working directory.
echo "${SLURM_NODELIST} ${SLURM_NNODES}"
# Sweep 512M to 16G, doubling each step, with checking on (-c 1) and an
# aggregation count of 4 (-m 4).
srun --cpu-bind=none --mpi=pmi2 --gres=gpu:8 /share/home/test/base/all_rp/all_reduce_perf -b 512M -e 16G -f 2 -g 1 -c 1 -m 4
# -N requires a node count and sbatch needs the script to submit.
# 2 matches the two-node srun example; adjust to the number of nodes to test.
sbatch -N 2 arp.slurm