参考文档:https://blog.csdn.net/xuecangqiuye/article/details/109687256

初始环境:CentOS 7.9 最小化安装:
192.168.2.130   mgr
192.168.2.131   node1
192.168.2.132   node2
上传slurm-20.11.2的安装包到管理节点的/home/share/下面。


一、安装mysql(管理节点)
# Step 1: install MariaDB on the management node (backend for Slurm accounting).
yum -y install mariadb-server
systemctl start mariadb
systemctl enable mariadb
# Set the root password and create the accounting database non-interactively
# (a bare `mysql` would drop into an interactive shell and block unattended runs;
# the password must match StoragePass in slurmdbd.conf).
mysql -e "set password=password('123456'); create database slurm_acct_db;"

在 mysql 提示符下依次执行以下语句:

  1. set password=password('123456');
  2. create database slurm_acct_db;
  3. quit

二、安装samba,用于传输文件,可忽略(管理节点)
# Step 2 (optional): install Samba on the management node so the Slurm tarball
# and config files can be dropped into /home/share from a workstation.
yum -y install samba
mkdir /home/share
chmod 777 /home/share/
# Minimal guest-accessible share. NOTE: the original config listed both
# "readonly=yes" and "writable = yes"; those are opposite directives and the
# later "writable = yes" wins, so only the effective one is kept here.
cat > /etc/samba/smb.conf <<'EOF'
[global]
        log file = /var/log/samba/log.%m
        max log size = 50
        security = user
        map to guest = Bad User
[share]
        path=/home/share
        browseable=yes
        writable = yes
        guest ok=yes
EOF
systemctl start smb
systemctl enable smb

# Firewalld would block the Samba/Munge/Slurm ports; turn it off on this host.
systemctl disable firewalld; systemctl stop firewalld

三、配置初始设定(每个节点上执行,包括管理和计算节点)
# Step 3: create the munge and slurm service accounts with FIXED UIDs/GIDs so
# they are identical on every node (run on management and compute nodes alike).
# getent/id guards make re-runs idempotent (groupadd/useradd fail if repeated).
export MUNGEUSER=991
getent group munge >/dev/null || groupadd -g "$MUNGEUSER" munge
id munge >/dev/null 2>&1 || \
  useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u "$MUNGEUSER" -g munge -s /sbin/nologin munge
export SLURMUSER=992
getent group slurm >/dev/null || groupadd -g "$SLURMUSER" slurm
id slurm >/dev/null 2>&1 || \
  useradd -m -c "SLURM workload manager" -d /var/lib/slurm -u "$SLURMUSER" -g slurm -s /bin/bash slurm

# Build- and run-time dependencies for Munge and Slurm (run on every node).
yum -y install \
  openssh-clients munge munge-libs munge-devel rng-tools \
  openssl openssl-devel pam-devel numactl numactl-devel \
  hwloc hwloc-devel lua lua-devel readline-devel rrdtool-devel \
  ncurses-devel man2html libibmad libibumad python3-pip \
  perl-ExtUtils-MakeMaker gcc rpm-build mysql-devel \
  json-c json-c-devel http-parser http-parser-devel
yum -y install ucx* hdf5 hdf5-devel freeipmi
yum -y install gcc make libffi-devel openssl-devel
yum -y install gcc gcc-c++ make autoconf m4 automake libtool
yum -y install libgpg-error libgcrypt

# Feed the entropy pool from /dev/urandom — minimal CentOS 7 installs can be
# entropy-starved, which stalls munge key generation.
rngd -r /dev/urandom

# Cluster name resolution; the grep guard prevents duplicate entries when the
# steps are re-run (a plain >> would append the block again every time).
grep -q '192.168.2.130   mgr' /etc/hosts || echo '192.168.2.130   mgr
192.168.2.131   node1
192.168.2.132   node2' >> /etc/hosts

四、开始安装和配置(在管理节点执行)
# Step 4: set hostnames and passwordless SSH (run on the management node).
# BUG FIX: use ">" (overwrite), not ">>" — /etc/hostname must contain exactly
# one name, and the node commands below already use ">".
echo mgr > /etc/hostname
ssh-keygen
ssh-copy-id node1
ssh-copy-id node2
ssh node1 "echo node1 > /etc/hostname"
ssh node2 "echo node2 > /etc/hostname"
# Reboot the compute nodes first, this host last. The final reboot ends the
# session: log back in and continue with the next section.
ssh node1 reboot
ssh node2 reboot
reboot

# Generate the shared munge key. The original ran create-munge-key AND then
# immediately overwrote its output with dd — one generator is enough, so only
# the dd variant is kept.
dd if=/dev/urandom bs=1 count=1024 > /etc/munge/munge.key
chown munge: /etc/munge/munge.key && chmod 400 /etc/munge/munge.key

# Distribute the identical key to the compute nodes, and enforce owner/mode
# there as well (scp does not guarantee the 400 mode on the destination).
scp /etc/munge/munge.key node1:/etc/munge/
scp /etc/munge/munge.key node2:/etc/munge/
ssh node1 "chown munge: /etc/munge/munge.key && chmod 400 /etc/munge/munge.key"
ssh node2 "chown munge: /etc/munge/munge.key && chmod 400 /etc/munge/munge.key"

# Fix ownership/permissions on the munge directories, then enable + start the
# daemon in one step and show its status.
chown -R munge: /etc/munge/ /var/log/munge/ && chmod 0700 /etc/munge/ /var/log/munge/
systemctl enable --now munge
systemctl status munge

# Apply the same munge permission fix and service start-up on each compute node.
for n in node1 node2; do
  ssh "$n" "chown -R munge: /etc/munge/ /var/log/munge/ && chmod 0700 /etc/munge/ /var/log/munge/"
  ssh "$n" systemctl enable munge
  ssh "$n" systemctl start munge
  ssh "$n" systemctl status munge
done

# Firewalld blocks slurmctld/slurmd/munge traffic; turn it off cluster-wide.
systemctl stop firewalld
systemctl disable firewalld
for n in node1 node2; do
  ssh "$n" systemctl stop firewalld
  ssh "$n" systemctl disable firewalld
done

# NOTE(review): these four yum lines repeat installs already performed for
# every node in step 三 above. Harmless (yum is a no-op for packages that are
# already installed), but presumably a leftover from the source blog — confirm
# before pruning.
yum -y install ucx* hdf5 hdf5-devel freeipmi
yum -y install gcc make libffi-devel openssl-devel 
yum -y install gcc gcc-c++ make autoconf m4 automake libtool 
yum -y install libgpg-error libgcrypt

# Build the Slurm RPMs from the source tarball and install them cluster-wide.
# Guard the cd and the build: everything after them is useless if they fail.
cd /home/share/ || exit 1
# --with mysql builds the accounting-storage plugin required by slurmdbd.
rpmbuild -ta --with mysql slurm-20.11.2.tar.bz2 || exit 1
cd
cp -rf rpmbuild/RPMS/x86_64 ./

yum localinstall x86_64/slurm-*.rpm -y

# Push the same RPM set to each compute node's $HOME and install it there.
scp -r x86_64/ node1:
scp -r x86_64/ node2:
ssh node1 "yum localinstall x86_64/slurm-*.rpm -y"
ssh node2 "yum localinstall x86_64/slurm-*.rpm -y"

# Seed /etc/slurm from the example configs shipped with the RPMs. slurm.conf
# and slurmdbd.conf are overwritten below with the prepared versions from
# /home/share; cgroup.conf keeps the example defaults.
cp /etc/slurm/slurm.conf.example /etc/slurm/slurm.conf
cp /etc/slurm/slurmdbd.conf.example /etc/slurm/slurmdbd.conf
cp /etc/slurm/cgroup.conf.example /etc/slurm/cgroup.conf

本文最后准备了slurm.conf 和 slurmdbd.conf文件供参考
# Install the prepared configs (sample contents are at the end of this document).
cat /home/share/slurm.conf > /etc/slurm/slurm.conf
cat /home/share/slurmdbd.conf > /etc/slurm/slurmdbd.conf
#cat /home/share/cgroup.conf > /etc/slurm/cgroup.conf  # example defaults are fine
# slurmdbd (>= 20.02) refuses to start unless slurmdbd.conf is readable only by
# SlurmUser (root in this setup) — it contains the database password.
chown root:root /etc/slurm/slurmdbd.conf && chmod 600 /etc/slurm/slurmdbd.conf

# slurm.conf and cgroup.conf must be identical cluster-wide. slurmdbd.conf is
# needed ONLY on the host running slurmdbd (mgr), so it is deliberately NOT
# copied to the compute nodes — doing so would spread the DB password.
scp /etc/slurm/slurm.conf node1:/etc/slurm/slurm.conf
scp /etc/slurm/slurm.conf node2:/etc/slurm/slurm.conf
scp /etc/slurm/cgroup.conf node1:/etc/slurm/cgroup.conf
scp /etc/slurm/cgroup.conf node2:/etc/slurm/cgroup.conf

# State and log directories for slurmctld on the management node.
# mkdir -p keeps re-runs idempotent (plain mkdir fails if the dir exists,
# which would short-circuit the chained chown/chmod).
mkdir -p /var/spool/slurmctld && chown slurm: /var/spool/slurmctld && chmod 755 /var/spool/slurmctld
mkdir -p /var/log/slurm && touch /var/log/slurm/slurmctld.log && chown slurm: /var/log/slurm/slurmctld.log
touch /var/log/slurm/slurm_jobacct.log /var/log/slurm/slurm_jobcomp.log && chown slurm: /var/log/slurm/slurm_jobacct.log /var/log/slurm/slurm_jobcomp.log

# Spool and log directories for slurmd on each compute node.
# mkdir -p keeps re-runs idempotent (plain mkdir fails if the dir exists,
# which would short-circuit the chained chown/chmod).
for n in node1 node2; do
  ssh "$n" "mkdir -p /var/spool/slurmd && chown slurm: /var/spool/slurmd && chmod 755 /var/spool/slurmd"
  ssh "$n" "mkdir -p /var/log/slurm && touch /var/log/slurm/slurmd.log && chown slurm: /var/log/slurm/slurmd.log"
done

# Bring up the daemons in dependency order: slurmdbd (accounting) first, then
# slurmctld on the controller, then slurmd on every compute node.
for svc in slurmdbd slurmctld; do
  systemctl enable "$svc"
  systemctl start "$svc"
  systemctl status "$svc"
done

for n in node1 node2; do
  ssh "$n" systemctl enable slurmd
  ssh "$n" systemctl restart slurmd
  ssh "$n" systemctl status slurmd
done

至此 slurm 安装完毕。如果启动服务的过程中报错,可用下面的调试方式在前台启动对应服务,查看详细的报错信息:

$ slurmctld -Dvvvvv
$ slurmdbd -Dvvvvv
$ slurmd -Dvvvvv

slurm.conf

配置内容如下:

  1. SlurmctldHost=mgr
  2.  
  3. #
  4. SlurmctldDebug=info
  5. SlurmdDebug=debug3
  6. GresTypes=gpu
  7.  
  8. MpiDefault=none
  9. ProctrackType=proctrack/cgroup
  10. SlurmctldPidFile=/var/run/slurmctld.pid
  11. SlurmctldPort=6817
  12. SlurmdPidFile=/var/run/slurmd.pid
  13. SlurmdPort=6818
  14. SlurmdSpoolDir=/var/spool/slurmd
  15. SlurmUser=root
  16. StateSaveLocation=/var/spool/slurmctld
  17. SwitchType=switch/none
  18. TaskPlugin=task/affinity,task/cgroup
  19. TaskPluginParam=Sched
  20.  
  21. # TIMERS
  22. InactiveLimit=0
  23. KillWait=15
  24. ResumeTimeout=600
  25. MinJobAge=300
  26. #OverTimeLimit=0
  27. SlurmctldTimeout=12
  28. SlurmdTimeout=300
  29. Waittime=0

  30. # SCHEDULING
  31. SchedulerType=sched/backfill
  32. SelectType=select/cons_tres
  33. SelectTypeParameters=CR_Core

  34. # LOGGING AND ACCOUNTING
  35. AccountingStorageEnforce=associations
  36. AccountingStorageHost=mgr
  37. AccountingStoragePort=6819
  38. AccountingStorageType=accounting_storage/slurmdbd
  39. AccountingStoreJobComment=YES
  40. ClusterName=slurm20_cluster
  41. JobCompHost=localhost
  42. JobCompPass=123456
  43. JobCompPort=3306
  44. JobCompType=jobcomp/mysql
  45. JobCompUser=root
  46. JobAcctGatherFrequency=1
  47. JobAcctGatherType=jobacct_gather/linux
  48. SlurmctldLogFile=/var/log/slurm/slurmctld.log
  49. SlurmdLogFile=/var/log/slurm/slurmd.log
  50.      
  51. # POWER SAVE SUPPORT FOR IDLE NODES (optional)
  52. SuspendTime=70
  53. NodeName=node[1-2] Procs=1 State=UNKNOWN
  54. PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP
slurmdbd.conf

配置内容如下:

  1. # Authentication info
  2. AuthType=auth/munge
  3. AuthInfo=/var/run/munge/munge.socket.2
  4. #DebugLevel=info

  5. # slurmDBD info
  6. DbdAddr=192.168.2.130
  7. DbdHost=localhost
  8. DbdPort=6819
  9. SlurmUser=root
  10. DebugLevel=verbose
  11. LogFile=/var/log/slurm/slurmdbd.log
  12. PidFile=/var/run/slurmdbd.pid

  13. # Database info
  14. StorageType=accounting_storage/mysql
  15. StorageHost=localhost
  16. StoragePort=3306
  17. StoragePass=123456
  18. StorageUser=root
  19. StorageLoc=slurm_acct_db
12-31 16:59