# LAMMPS on Taiwania3

## version

- 2021/1/20. patch_24Dec2020-106-g102a6eb
- 2021/1/28. patch_24Dec2020-194-g364727a
- 2021/1/29. patch_24Dec2020-208-ga77bb30
- 2021/2/2. patch_24Dec2020-326-g1a7cb46
- 2021/2/5. Update example script
- 2021/3/1. patch_10Feb2021-211-g9efc831
- 2021/3/24. patch_10Mar2021-137-g73b9f22

## compiler toolchain

For versions up to and including patch_24Dec2020-326-g1a7cb46:

```
module load intel/2020 cmake/3.15.4 hdf5/1.12.0_intelmpi-19.1.3.304 netcdf/4.7.4-hdf5-1.12.0_intelmpi-19.1.3.304
```

For patch_10Feb2021-211-g9efc831 and later:

```
module load compiler/intel/2021 IntelMPI/2021 hdf5/1.12 netcdf/4.7.4 adios2/2.7.1
```

## compiler recipe

```
# intel_cpu_intelmpi = USER-INTEL package, Intel MPI, MKL FFT

SHELL = /bin/sh

# ---------------------------------------------------------------------
# compiler/linker settings
# specify flags and libraries needed for your compiler

CC = mpiicpc -std=c++11
OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits \
           -qopt-zmm-usage=high
CCFLAGS = -qopenmp -qno-offload -ansi-alias -restrict \
          -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) \
          -I$(MKLROOT)/include
SHFLAGS = -fPIC
DEPFLAGS = -M

LINK = mpiicpc -std=c++11
LINKFLAGS = -qopenmp $(OPTFLAGS) -L$(MKLROOT)/lib/intel64/
LIB = -ltbbmalloc -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core
SIZE = size

ARCHIVE = ar
ARFLAGS = -rc
SHLIBFLAGS = -shared

# ---------------------------------------------------------------------
# LAMMPS-specific settings, all OPTIONAL
# specify settings for LAMMPS features you will use
# if you change any -D setting, do full re-compile after "make clean"

# LAMMPS ifdef settings
# see possible settings in Section 3.5 of the manual

LMP_INC = -DLAMMPS_GZIP -DLAMMPS_BIGBIG -DLAMMPS_PNG

# MPI library
# see discussion in Section 3.4 of the manual
# MPI wrapper compiler/linker can provide this info
# can point to dummy MPI library in src/STUBS as in Makefile.serial
# use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts
# INC = path for mpi.h, MPI compiler settings
# PATH = path for MPI library
# LIB = name of MPI library

MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1
MPI_PATH =
MPI_LIB =

# FFT library
# see discussion in Section 3.5.2 of manual
# can be left blank to use provided KISS FFT library
# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
# PATH = path for FFT library
# LIB = name of FFT library

FFT_INC = -DFFT_MKL -DFFT_SINGLE
FFT_PATH =
FFT_LIB =

# JPEG and/or PNG library
# see discussion in Section 3.5.4 of manual
# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
# INC = path(s) for jpeglib.h and/or png.h
# PATH = path(s) for JPEG library and/or PNG library
# LIB = name(s) of JPEG library and/or PNG library

JPG_INC = -I/usr/local/include
JPG_PATH = -L/usr/lib64
JPG_LIB = -lpng

# ---------------------------------------------------------------------
# build rules and dependencies
# do not edit this section

include Makefile.package.settings
include Makefile.package

EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS)
EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS)

# Path to src files

vpath %.cpp ..
vpath %.h ..

# Link target

$(EXE): main.o $(LMPLIB) $(EXTRA_LINK_DEPENDS)
	$(LINK) $(LINKFLAGS) main.o $(EXTRA_PATH) $(LMPLINK) $(EXTRA_LIB) $(LIB) -o $@
	$(SIZE) $@

# Library targets

$(ARLIB): $(OBJ) $(EXTRA_LINK_DEPENDS)
	@rm -f ../$(ARLIB)
	$(ARCHIVE) $(ARFLAGS) ../$(ARLIB) $(OBJ)
	@rm -f $(ARLIB)
	@ln -s ../$(ARLIB) $(ARLIB)

$(SHLIB): $(OBJ) $(EXTRA_LINK_DEPENDS)
	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o ../$(SHLIB) \
		$(OBJ) $(EXTRA_LIB) $(LIB)
	@rm -f $(SHLIB)
	@ln -s ../$(SHLIB) $(SHLIB)

# Compilation rules

%.o:%.cpp
	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<

# Individual dependencies

depend : fastdep.exe $(SRC)
	@./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1

fastdep.exe: ../DEPEND/fastdep.c
	cc -O -o $@ $<

sinclude .depend
```
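For the conventional make build, the recipe above has to live where `make intel_cpu_intelmpi` can find it. The block below is a minimal sketch, assuming the LAMMPS source tree is in `~/lammps` and the recipe has been saved locally as `Makefile.intel_cpu_intelmpi` (both placeholders); `src/MAKE/MINE/` is the usual location for site-specific makefiles.

```
# Hedged sketch: install the recipe above as a site-specific machine makefile.
# ~/lammps and ./Makefile.intel_cpu_intelmpi are placeholders for your own paths.
mkdir -p ~/lammps/src/MAKE/MINE
cp ./Makefile.intel_cpu_intelmpi ~/lammps/src/MAKE/MINE/
cd ~/lammps/src
make intel_cpu_intelmpi -j 28    # "make <machine>" picks up Makefile.<machine> from MAKE/MINE
```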
### note

1. Install ccache to speed up recompilation.
2. Install zeromq (login and compute nodes) and zeromq-devel (login node) for the MESSAGE package.
3. cslib currently does not work with the -DLAMMPS_BIGBIG flag.
4. Install voro++ (login and compute nodes) and voro++-devel (login node) for the VORONOI package.
5. The molfile package needs libdl.so, which is part of glibc and should already be installed everywhere.
6. When modifying a bundled makefile under lib/, make a copy first (`cp Makefile Makefile.icc`) and build with `make -f Makefile.icc`, so later `git pull` does not run into conflicts.
7. For the PYTHON package, manually replace Makefile.lammps with Makefile.lammps.python3 in lib/python so the system python3 interpreter is used.
8. When installing the MESSAGE package, the bundled cslib must be compiled first; manually add --std=c++11 to CCFLAGS in its Makefile, otherwise compilation fails with `identifier "nullptr" is undefined`.
9. After `git pull`, run `make pu && make ps` in src to refresh the installed packages' sources and show their status, then recompile (see the sketch after this list).
10. `git describe --tags` shows the currently checked-out version.
11. `make clean-all`
12. `make intel_cpu_intelmpi -j 28`
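Notes 9-12 form a single update-and-rebuild cycle; below is a minimal sketch, assuming the LAMMPS git checkout lives in `~/lammps` (a placeholder) and the newer modules from the compiler toolchain section.

```
# Hedged sketch of the update-and-rebuild cycle from notes 9-12.
# ~/lammps is a placeholder for your LAMMPS git checkout.
module purge
module load compiler/intel/2021 IntelMPI/2021 hdf5/1.12 netcdf/4.7.4 adios2/2.7.1

cd ~/lammps
git pull
git describe --tags              # note 10: show the current version

cd src
make pu && make ps               # note 9: refresh installed package sources and show their status
make clean-all                   # note 11: full clean, needed after changing packages or -D settings
make intel_cpu_intelmpi -j 28    # note 12: rebuild with the recipe above
```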
## exe path

- initial build: `/opt/ohpc/pkg/lammps/patch_24Dec2020-106-g102a6eb/lmp_intel_cpu_intelmpi`
- add voronoi package: `/opt/ohpc/pkg/lammps/patch_24Dec2020-194-g364727a/lmp_intel_cpu_intelmpi`
- add meam, user-intel, user-omp, molfile, python3 package: `/opt/ohpc/pkg/lammps/patch_24Dec2020-208-ga77bb30/lmp_intel_cpu_intelmpi_p3`
- add meam, user-intel, user-omp, molfile, python2 package: `/opt/ohpc/pkg/lammps/patch_24Dec2020-208-ga77bb30/lmp_intel_cpu_intelmpi_p2`
- add netcdf, hdf5 package: `/opt/ohpc/pkg/lammps/patch_24Dec2020-208-ga77bb30/lmp_intel_cpu_intelmpi`
- small update, latest: `/opt/ohpc/pkg/lammps/patch_24Dec2020-326-g1a7cb46/lmp_intel_cpu_intelmpi_p2` and `/opt/ohpc/pkg/lammps/patch_24Dec2020-326-g1a7cb46/lmp_intel_cpu_intelmpi_p3`
- add user-adios and update toolchain: `/opt/ohpc/pkg/lammps/patch_10Feb2021-211-g9efc831/lmp_intel_cpu_intelmpi`
- update toolchain: `/opt/ohpc/pkg/lammps/patch_10Mar2021-137-g73b9f22/lmp_intel_cpu_intelmpi`

## modulefile path

Not set yet.

## input file

```
# 3d Lennard-Jones melt

variable x index 1
variable y index 1
variable z index 1

variable xx equal 20*$x
variable yy equal 20*$y
variable zz equal 20*$z

units lj
atom_style atomic

lattice fcc 0.8442
region box block 0 ${xx} 0 ${yy} 0 ${zz}
create_box 1 box
create_atoms 1 box
mass 1 1.0

velocity all create 1.44 87287 loop geom

pair_style lj/cut 2.5
pair_coeff 1 1 1.0 1.0 2.5

neighbor 0.3 bin
neigh_modify delay 0 every 20 check no

fix 1 all nve

run 100
```

## basic submission script

```
#!/bin/bash
#SBATCH --job-name lammps_test    # Job name
#SBATCH --output %x-%j.out        # Name of stdout output file (%x expands to jobname, %j expands to jobId)

####Step 1: Selection of Nodes
###SBATCH --nodelist=gn[0103-0105,107].twcc.ai   #Request a specific list of hosts
#SBATCH --nodes=4                 #Controls the number of nodes allocated to the job
###SBATCH --ntasks=8              #Controls the number of tasks to be created for the job
#SBATCH --cpus-per-task=1         #Controls the number of CPUs allocated per task
#SBATCH --ntasks-per-node=56      #Controls the maximum number of tasks per allocated node
###SBATCH --ntasks-per-core       #Controls the maximum number of tasks per allocated core
###SBATCH --ntasks-per-socket     #Controls the maximum number of tasks per allocated socket

###Step 2: Allocation of CPUs from the selected Nodes
###SBATCH --distribution=block:cyclic
#before the ":" controls the sequence in which tasks are distributed to each of the selected nodes.
#after the ":" controls the sequence in which tasks are distributed to sockets within a node.

###SBATCH --time 24:00:00         # Run time (hh:mm:ss)
#SBATCH --partition test
#SBATCH --account GOV108018

module purge
###module load intel/2020 hdf5/1.12.0_intelmpi-19.1.3.304 netcdf/4.7.4-hdf5-1.12.0_intelmpi-19.1.3.304
module load compiler/intel/2021 IntelMPI/2021 hdf5/1.12 netcdf/4.7.4 adios2/2.7.1

### force the fabric selection (shared memory + OFI)
export I_MPI_FABRICS=shm:ofi
export UCX_TLS=rc,ud,sm,self
### set the process management library
export I_MPI_PMI_LIBRARY=/usr/lib64/libpmi2.so
### set debug level, 0: no debug info
export I_MPI_DEBUG=10
###export I_MPI_HYDRA_BOOTSTRAP=slurm
### set cpu binding
export I_MPI_PIN=1

###export EXE=/opt/ohpc/pkg/lammps/patch_24Dec2020-208-ga77bb30/lmp_intel_cpu_intelmpi
export EXE=/opt/ohpc/pkg/lammps/patch_10Feb2021-211-g9efc831/lmp_intel_cpu_intelmpi

echo "Running on hosts: $SLURM_NODELIST"
echo "Running on $SLURM_NNODES nodes."
echo "Running $SLURM_NTASKS tasks."
echo "$SLURM_MPI_TYPE"

SUBMIT_FILE=`scontrol show job $SLURM_JOB_ID | grep "Command=" | awk 'BEGIN {FS="="}; {print $2}'`
#echo $SUBMIT_FILE
#echo ${SUBMIT_FILE##/*/}
#echo "$SLURM_SUBMIT_DIR/$SLURM_JOB_ID.debug"
cp $SUBMIT_FILE $SLURM_SUBMIT_DIR/$SLURM_JOB_ID.debug

#mpiexec.hydra $EXE -sf hybrid intel omp -nocite -var x 20 -var y 35 -var z 40 -in in.lj
#mpiexec.hydra -bootstrap slurm -n $SLURM_NTASKS $EXE -sf hybrid intel omp -nocite -var x 12 -var y 21 -var z 24 -in in.lj
mpiexec.hydra -n $SLURM_NTASKS $EXE -sf hybrid intel omp -nocite -var x 12 -var y 21 -var z 24 -in in.lj
#srun -n $SLURM_NTASKS $EXE -sf hybrid intel omp -nocite -var x 3 -var y 21 -var z 24 -in in.lj

echo "Your LAMMPS job completed at `date` "
```

### Command-line options

#### -nocite

Disable writing the log.cite file.

#### -suffix style args

LAMMPS currently has acceleration support for three kinds of hardware, via the listed packages:

| Hardware | Packages |
| :--------: | :--------: |
| Many-core CPUs | USER-INTEL, KOKKOS, USER-OMP, OPT packages |
| NVIDIA/AMD GPUs | GPU, KOKKOS packages |
| Intel Phi/AVX | USER-INTEL, KOKKOS packages |

- `-sf intel`: use styles from the USER-INTEL package
- `-sf opt`: use styles from the OPT package
- `-sf omp`: use styles from the USER-OMP package
- `-sf hybrid intel omp`: use styles from the USER-INTEL package if they are installed and available, but styles from the USER-OMP package otherwise
- `-sf hybrid intel opt`: use styles from the USER-INTEL package if they are installed and available, but styles from the OPT package otherwise
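Which `-sf` values actually work depends on which accelerator packages were compiled into the chosen binary. A quick check is to print the executable's help text, which lists the installed packages; the block below is a minimal sketch, assuming the patch_10Feb2021 binary from the exe path section (adjust EXE to the build you use).

```
# Check which accelerator packages are built into a given binary before picking a suffix.
# EXE is the patch_10Feb2021 build from the "exe path" section above; adjust as needed.
EXE=/opt/ohpc/pkg/lammps/patch_10Feb2021-211-g9efc831/lmp_intel_cpu_intelmpi
$EXE -h | grep -A 10 -i "installed packages"   # USER-INTEL / USER-OMP / OPT should appear here

# Example runs of the same input with different suffix styles (inside a Slurm allocation):
mpiexec.hydra -n $SLURM_NTASKS $EXE -sf omp -nocite -in in.lj
mpiexec.hydra -n $SLURM_NTASKS $EXE -sf hybrid intel omp -nocite -in in.lj
```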
## Running many sequential jobs in parallel using job arrays

```
#!/bin/bash
#SBATCH --job-name sja            # Job name
#SBATCH --output %x-%A_%a.out     # Name of stdout output file (%x expands to jobname, %A to the array master jobId, %a to the array task index)

####Step 1: Selection of Nodes
###SBATCH --nodelist=gn[0103-0105,107].twcc.ai   #Request a specific list of hosts
#SBATCH --nodes=1                 #Controls the number of nodes allocated to the job
###SBATCH --ntasks=8              #Controls the number of tasks to be created for the job
###SBATCH --cpus-per-task=1       #Controls the number of CPUs allocated per task
###SBATCH --ntasks-per-node=4     #Controls the maximum number of tasks per allocated node
###SBATCH --ntasks-per-core       #Controls the maximum number of tasks per allocated core
###SBATCH --ntasks-per-socket     #Controls the maximum number of tasks per allocated socket
#SBATCH --array=0-3

###Step 2: Allocation of CPUs from the selected Nodes
#SBATCH --distribution=block:cyclic
#before the ":" controls the sequence in which tasks are distributed to each of the selected nodes.
#after the ":" controls the sequence in which tasks are distributed to sockets within a node.

###SBATCH --time 24:00:00         # Run time (hh:mm:ss)
#SBATCH --partition gpu
#SBATCH --account GOV109199

module purge

echo "Running on hosts: $SLURM_NODELIST"
echo "Running on $SLURM_NNODES nodes."
echo "Running $SLURM_NTASKS tasks."
echo "$SLURM_MPI_TYPE"

SUBMIT_FILE=`scontrol show job $SLURM_JOB_ID | grep "Command=" | awk 'BEGIN {FS="="}; {print $2}'`
#echo $SUBMIT_FILE
#echo ${SUBMIT_FILE##/*/}
#echo "$SLURM_SUBMIT_DIR/$SLURM_JOB_ID.debug"
#cp $SUBMIT_FILE $SLURM_SUBMIT_DIR/$SLURM_JOB_ID.debug

echo "Your job starts at `date`"

python3 Ramanujan.py

echo "Your job completed at `date` "
```

## Packaging smaller parallel jobs into one job script

```
#!/bin/bash
#SBATCH --job-name lammps_test    # Job name
#SBATCH --output %x-%j.out        # Name of stdout output file (%x expands to jobname, %j expands to jobId)

####Step 1: Selection of Nodes
###SBATCH --nodelist=gn[0103-0105,107].twcc.ai   #Request a specific list of hosts
#SBATCH --nodes=8                 #Controls the number of nodes allocated to the job
###SBATCH --ntasks=8              #Controls the number of tasks to be created for the job
#SBATCH --cpus-per-task=1         #Controls the number of CPUs allocated per task
#SBATCH --ntasks-per-node=56      #Controls the maximum number of tasks per allocated node
###SBATCH --ntasks-per-core       #Controls the maximum number of tasks per allocated core
###SBATCH --ntasks-per-socket     #Controls the maximum number of tasks per allocated socket

###Step 2: Allocation of CPUs from the selected Nodes
###SBATCH --distribution=block:cyclic
#before the ":" controls the sequence in which tasks are distributed to each of the selected nodes.
#after the ":" controls the sequence in which tasks are distributed to sockets within a node.

###SBATCH --time 24:00:00         # Run time (hh:mm:ss)
#SBATCH --partition test
#SBATCH --account GOV109199

module purge
module load intel/19.1.3.304

export I_MPI_FABRICS=shm:ofi
export UCX_TLS=rc,ud,sm,self
export I_MPI_PMI_LIBRARY=/opt/qct/ohpc/admin/pmix/lib/libpmi2.so
export I_MPI_DEBUG=10
export EXE=/opt/ohpc/pkg/lammps/patch_24Dec2020-106-g102a6eb/lmp_intel_cpu_intelmpi

# When running a large number of tasks simultaneously, it may be
# necessary to increase the user process limit.
ulimit -u 10000

echo "Running on hosts: $SLURM_NODELIST"
echo "Running on $SLURM_NNODES nodes."
echo "Running $SLURM_NTASKS tasks."

echo "Your LAMMPS job starts at `date`"

mpiexec.hydra $EXE -sf hybrid intel omp -nocite -var x 20 -var y 35 -var z 40 -in in.lj

echo "Your LAMMPS job completed at `date` "
```
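The script above reserves 8 nodes but launches only a single mpiexec. To actually pack several smaller runs into one allocation, independent job steps can be started in the background and collected with `wait`. The block below is a minimal sketch, assuming four run directories `run1` ... `run4` each containing an `in.lj`, two nodes (112 tasks) per step, and that `--exclusive` acts as a per-step resource fence on the installed Slurm version; adjust all of these to your own jobs.

```
# Hedged sketch: pack four independent 2-node LAMMPS steps into the 8-node allocation above.
# run1..run4 and the per-step node/task counts are placeholders.
for i in 1 2 3 4; do
  (
    cd run$i
    srun --exclusive -N 2 -n 112 $EXE -sf hybrid intel omp -nocite -in in.lj > log.$i 2>&1
  ) &
done
wait   # keep the batch script alive until every packed step has finished
```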
echo "Your LAMMPS job starts at `date`" mpiexec.hydra $EXE -sf hybrid intel omp -nocite -var x 20 -var y 35 -var z 40 -in in.lj echo "Your LAMMPS job completed at `date` " ``` ## MPI-IO test problem size: 286720000 atoms conputing resource: 160 nodes (8960 cores) ![](https://cos.twcc.ai/SYS-MANUAL/uploads/upload_7b0419b4c1de14e92d5f3a9eda9aa0f5.png) lammps command: write_dump all image initial.png type type view 60 150 axes yes 0.2 0.02 subbox yes 0.01 size 1024 1024 subbox yes 0.01這選項可以把空間如何切分給視覺化 想像xy平面平鋪於桌面,z軸垂直於桌面, view 60 150 第一個60是xy平面的仰角,第二個150是以z為中心做旋轉。 dump variable: mass type xs ys zs element 4 float64, 1 integer, 1 string 4 * 8bytes+1 * 4bytes+1 * 4bytes=40bytes 286,720,000 * 40=11,468,800,000bytes=11,200,000KB | dump type | size | Access time | Modify time | time | | -------- | -------- | -------- |-------- |-------- | | image | 300 KB | 02:37:04 | 02:37:05 | 1s | | cfg/mpiio.gz | 11,095,363 KB | 02:37:05 | 02:37:35 | 30s | | cfg | 11,095,363 KB | 02:37:58 | 02:38:04 | 6s | | cfg/gz | 790,966 KB | 02:38:04 | 03:10:00 | 31m56s | | cfg/mpiio.bin | 11,095,363 KB | 03:10:00 | 03:10:31 | 31s | ps1. You can use the “.bin” suffix described below in an MPI-IO dump file; A binary dump file will be about the same size as a text version, but will typically write out much faster. ### formating sacct sacct --format="JobID,JobName%30,Partition,Account,AllocCPUS,State,ExitCode" ### Large Scale simulation consideration ## mpi ### intelmpi #### Reduce initialization times If all ranks work on the same Intel Architecture generation, switch off the platform check: `I_MPI_PLATFORM_CHECK=0` Specify the processor architecture being used to tune the collective operations: `I_MPI_PLATFORM=uniform` Alternative PMI data exchange algorithm can help to speed up the startup phase: `I_MPI_HYDRA_PMI_CONNECT=alltoall` Customizing the branching may also help startup times (default is 32 for over 127 nodes): `I_MPI_HYDRA_BRANCH_COUNT=<n>` ### openmpi ## Queuing system ### slurm Specifies that the batch job should never be requeued under any circumstances. `#SBATCH --no-requeue` ### pbspro