初始环境:CentOS 7.9 最小化安装,共三台机器(IP 与主机名如下):
192.168.2.130 mgr
192.168.2.131 node1
192.168.2.132 node2
将 slurm-20.11.2 的安装包(slurm-20.11.2.tar.bz2)上传到管理节点的 /home/share/ 目录下(该目录在第二步中创建;若跳过第二步,请先执行 mkdir -p /home/share)。
一、安装mysql(管理节点)
# Install and start MariaDB on the management node; it stores the Slurm
# accounting database.
yum -y install mariadb-server
systemctl start mariadb
systemctl enable mariadb
# Feed the SQL to the client through a quoted here-document so the step can be
# pasted as-is instead of being typed at the interactive mysql prompt.
# The root password set here must match StoragePass in slurmdbd.conf and
# JobCompPass in slurm.conf (both use 123456 in the reference files).
mysql <<'SQL'
set password=password('123456');
create database slurm_acct_db;
quit
SQL
二、安装samba,用于传输文件,可忽略(管理节点)
# Optional (management node): export /home/share through Samba as a guest
# share so files can be copied onto the cluster.
yum -y install samba
# -p: the directory may already exist, e.g. when the Slurm tarball was
# uploaded there beforehand.
mkdir -p /home/share
chmod 777 /home/share/
# Write smb.conf verbatim (quoted here-document, no shell expansion).
# The original listed both "readonly=yes" and "writable = yes"; these are
# inverted synonyms of the same setting, so only the later one (writable)
# took effect. The contradictory "readonly" line is dropped.
cat > /etc/samba/smb.conf <<'EOF'
[global]
log file = /var/log/samba/log.%m
max log size = 50
security = user
map to guest = Bad User
[share]
path=/home/share
browseable=yes
writable = yes
guest ok=yes
EOF
systemctl start smb
systemctl enable smb
# The firewall is switched off entirely rather than opening the Samba ports.
systemctl disable firewalld
systemctl stop firewalld
三、配置初始设定(每个节点上执行,包括管理和计算节点)
# Run on EVERY node (management and compute).
# Create the munge and slurm accounts with fixed UID/GID values so that they
# are numerically identical on all nodes.
export MUNGEUSER=991 && groupadd -g "$MUNGEUSER" munge
useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u "$MUNGEUSER" -g munge -s /sbin/nologin munge
export SLURMUSER=992 && groupadd -g "$SLURMUSER" slurm
useradd -m -c "SLURM workload manager" -d /var/lib/slurm -u "$SLURMUSER" -g slurm -s /bin/bash slurm
# NOTE(review): munge (and some of the other packages below) are expected to
# come from EPEL rather than the CentOS 7 base repositories, so enable EPEL
# first; on a minimal install it is not configured.
yum -y install epel-release
# Build and runtime dependencies for munge and Slurm. Duplicated package names
# from the original four yum invocations are listed once.
# 'ucx*' is quoted so that yum, not the shell, expands the wildcard.
yum -y install openssh-clients munge munge-libs munge-devel rng-tools \
  openssl openssl-devel pam-devel numactl numactl-devel hwloc hwloc-devel \
  lua lua-devel readline-devel rrdtool-devel ncurses-devel man2html \
  libibmad libibumad python3-pip perl-ExtUtils-MakeMaker gcc rpm-build \
  mysql-devel json-c json-c-devel http-parser http-parser-devel
yum -y install 'ucx*' hdf5 hdf5-devel freeipmi
yum -y install gcc-c++ make autoconf m4 automake libtool libffi-devel
yum -y install libgpg-error libgcrypt
# Start the rng-tools daemon with /dev/urandom as its random source.
rngd -r /dev/urandom
# Make all cluster members resolvable by name on this node.
cat >> /etc/hosts <<'EOF'
192.168.2.130 mgr
192.168.2.131 node1
192.168.2.132 node2
EOF
四、开始安装和配置(在管理节点执行)
# Management node only: set the hostnames, give root password-less SSH access
# to the compute nodes, then reboot everything so the new names take effect.
#
# Overwrite (">") rather than append (">>"): appending left the old name on the
# first line of /etc/hostname, followed by "mgr" on a second line. This now
# matches how the compute nodes are handled below.
echo mgr > /etc/hostname
ssh-keygen
for node in node1 node2; do
  ssh-copy-id "$node"
done
for node in node1 node2; do
  ssh "$node" "echo $node > /etc/hostname"
done
for node in node1 node2; do
  ssh "$node" reboot
done
# This ends the current session; log in to mgr again before continuing with
# the next commands.
reboot
# Management node: generate the MUNGE key, copy it to every compute node, and
# bring the munge service up everywhere. All nodes must share the same key.
/usr/sbin/create-munge-key -r
dd if=/dev/urandom bs=1 count=1024 > /etc/munge/munge.key
chown munge: /etc/munge/munge.key && chmod 400 /etc/munge/munge.key
for node in node1 node2; do
  scp /etc/munge/munge.key "${node}:/etc/munge/"
done

# Local munge daemon: fix ownership/permissions, then enable and start it.
chown -R munge: /etc/munge/ /var/log/munge/ && chmod 0700 /etc/munge/ /var/log/munge/
systemctl enable munge
systemctl start munge
systemctl status munge

# Same ownership fix and service start on each compute node.
for node in node1 node2; do
  ssh "$node" "chown -R munge: /etc/munge/ /var/log/munge/ && chmod 0700 /etc/munge/ /var/log/munge/"
  ssh "$node" systemctl enable munge
  ssh "$node" systemctl start munge
  ssh "$node" systemctl status munge
done

# Turn the firewall off on the management node and on every compute node.
systemctl stop firewalld
systemctl disable firewalld
for node in node1 node2; do
  ssh "$node" systemctl stop firewalld
  ssh "$node" systemctl disable firewalld
done
# Management node: make sure the build dependencies are present, build the
# Slurm RPMs from the uploaded tarball, and install them on every node.
# 'ucx*' is quoted so that yum, not the shell, expands the wildcard; package
# names that were repeated across the original yum lines are listed once.
yum -y install 'ucx*' hdf5 hdf5-devel freeipmi \
  gcc gcc-c++ make autoconf m4 automake libtool \
  libffi-devel openssl-devel libgpg-error libgcrypt
cd /home/share/
# -ta builds binary and source packages straight from the tarball;
# "--with mysql" is passed so the MySQL/MariaDB support is built.
rpmbuild -ta --with mysql slurm-20.11.2.tar.bz2
# Back to the home directory: rpmbuild/ and x86_64/ below are relative to it.
cd
cp -rf rpmbuild/RPMS/x86_64 ./
yum localinstall x86_64/slurm-*.rpm -y
# Copy the built packages to each compute node's home directory ...
for node in node1 node2; do
  scp -r x86_64/ "${node}:"
done
# ... and install them there. The glob is inside the quotes so that it is
# expanded on the remote side.
for node in node1 node2; do
  ssh "$node" "yum localinstall x86_64/slurm-*.rpm -y"
done
# Create slurm.conf, slurmdbd.conf and cgroup.conf from the matching
# *.conf.example files in /etc/slurm.
for name in slurm slurmdbd cgroup; do
  cp "/etc/slurm/${name}.conf.example" "/etc/slurm/${name}.conf"
done
本文最后附有 slurm.conf 和 slurmdbd.conf 的内容供参考;请先将这两个文件保存到 /home/share/ 下,再执行下面的命令。
# Install the prepared configuration files on the management node.
cat /home/share/slurm.conf > /etc/slurm/slurm.conf
cat /home/share/slurmdbd.conf > /etc/slurm/slurmdbd.conf
# slurmdbd.conf contains the database password (StoragePass). Restrict it to
# its owner; slurmdbd expects this file to be mode 600 and owned by SlurmUser
# (root in the reference slurmdbd.conf) and may refuse to start otherwise.
chmod 600 /etc/slurm/slurmdbd.conf
#cat /home/share/cgroup.conf > /etc/slurm/cgroup.conf # the default file is sufficient
# Distribute slurm.conf and cgroup.conf to the compute nodes.
# slurmdbd.conf is intentionally NOT copied: slurmdbd runs only on mgr, and the
# file would expose the database password on every node.
for node in node1 node2; do
  scp /etc/slurm/slurm.conf "${node}:/etc/slurm/slurm.conf"
  scp /etc/slurm/cgroup.conf "${node}:/etc/slurm/cgroup.conf"
done
# Create the state and log locations named in slurm.conf (StateSaveLocation,
# SlurmdSpoolDir, SlurmctldLogFile, SlurmdLogFile).
# mkdir -p: with plain mkdir an already existing directory makes mkdir fail,
# and the "&&" chain then silently skips the touch/chown that follow.
mkdir -p /var/spool/slurmctld && chown slurm: /var/spool/slurmctld && chmod 755 /var/spool/slurmctld
mkdir -p /var/log/slurm && touch /var/log/slurm/slurmctld.log && chown slurm: /var/log/slurm/slurmctld.log
touch /var/log/slurm/slurm_jobacct.log /var/log/slurm/slurm_jobcomp.log && chown slurm: /var/log/slurm/slurm_jobacct.log /var/log/slurm/slurm_jobcomp.log
# Same preparation for slurmd on each compute node.
for node in node1 node2; do
  ssh "$node" "mkdir -p /var/spool/slurmd && chown slurm: /var/spool/slurmd && chmod 755 /var/spool/slurmd"
  ssh "$node" "mkdir -p /var/log/slurm && touch /var/log/slurm/slurmd.log && chown slurm: /var/log/slurm/slurmd.log"
done
# Management node: enable, start and check the accounting daemon first, then
# the controller.
for daemon in slurmdbd slurmctld; do
  systemctl enable "$daemon"
  systemctl start "$daemon"
  systemctl status "$daemon"
done

# Compute nodes: enable, (re)start and check slurmd over SSH.
for node in node1 node2; do
  ssh "$node" systemctl enable slurmd
  ssh "$node" systemctl restart slurmd
  ssh "$node" systemctl status slurmd
done
至此 Slurm 安装完毕。如果启动服务时报错,可以用调试方式手动启动对应的守护进程,查看具体的报错信息:
# Start the failing daemon by hand in debug mode to see the errors it reports
# while starting. The leading "$ " is the shell prompt, not part of the command.
$ slurmctld -Dvvvvv
$ slurmdbd -Dvvvvv
$ slurmd -Dvvvvv
slurm.conf
# /etc/slurm/slurm.conf - reference configuration for the three-node cluster
# (controller "mgr", compute nodes node1 and node2). All values are unchanged;
# only the blog-widget artifacts (fold link, "- " list prefixes) were removed.
SlurmctldHost=mgr

#
SlurmctldDebug=info
SlurmdDebug=debug3
GresTypes=gpu

MpiDefault=none
ProctrackType=proctrack/cgroup
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=root
StateSaveLocation=/var/spool/slurmctld
SwitchType=switch/none
TaskPlugin=task/affinity,task/cgroup
TaskPluginParam=Sched

# TIMERS
InactiveLimit=0
KillWait=15
ResumeTimeout=600
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=12
SlurmdTimeout=300
Waittime=0
# SCHEDULING
SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_Core
# LOGGING AND ACCOUNTING
AccountingStorageEnforce=associations
AccountingStorageHost=mgr
AccountingStoragePort=6819
AccountingStorageType=accounting_storage/slurmdbd
AccountingStoreJobComment=YES
ClusterName=slurm20_cluster
# Job completion records go straight to the local MariaDB instance;
# JobCompUser/JobCompPass are the database root credentials set during the
# MariaDB installation step.
JobCompHost=localhost
JobCompPass=123456
JobCompPort=3306
JobCompType=jobcomp/mysql
JobCompUser=root
JobAcctGatherFrequency=1
JobAcctGatherType=jobacct_gather/linux
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdLogFile=/var/log/slurm/slurmd.log

# POWER SAVE SUPPORT FOR IDLE NODES (optional)
SuspendTime=70
NodeName=node[1-2] Procs=1 State=UNKNOWN
PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP
slurmdbd.conf
# /etc/slurm/slurmdbd.conf - reference configuration for slurmdbd on the
# management node. All values are unchanged; only the "- " list prefixes left
# by the blog widget were removed.
# Authentication info
AuthType=auth/munge
AuthInfo=/var/run/munge/munge.socket.2
#DebugLevel=info
# slurmDBD info
DbdAddr=192.168.2.130
DbdHost=localhost
DbdPort=6819
SlurmUser=root
DebugLevel=verbose
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/var/run/slurmdbd.pid
# Database info
# StorageUser/StoragePass are the MariaDB root credentials and StorageLoc is
# the database created during the MariaDB installation step.
StorageType=accounting_storage/mysql
StorageHost=localhost
StoragePort=3306
StoragePass=123456
StorageUser=root
StorageLoc=slurm_acct_db