From 88d26376ff5d4092390a3f629dedc7623aaf0e78 Mon Sep 17 00:00:00 2001
From: Juan ESCOBAR <juan.escobar@aero.obs-mip.fr>
Date: Fri, 16 Dec 2022 15:45:02 +0100
Subject: [PATCH] Juan 16/12/2022:add set_rocm_device & Rocprof tools

---
 bin/Rocprof         | 23 +++++++++++++++++++++++
 bin/set_rocm_device | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+)
 create mode 100755 bin/Rocprof
 create mode 100755 bin/set_rocm_device

diff --git a/bin/Rocprof b/bin/Rocprof
new file mode 100755
index 000000000..c0bc6ab32
--- /dev/null
+++ b/bin/Rocprof
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Rocprof: run a command under rocprof, writing one CSV result file per rank.
+# Usage: [mpirun|srun ...] Rocprof [ROCPROF_OPTIONS] COMMAND [ARGS...]
+# The result file name is built from XYZ, NP, NG, NC, CG and SLURM_JOBID as
+# found in the environment, plus the rank detected below (empty if none is).
+if [[ -n ${OMPI_COMM_WORLD_RANK+z} ]]; then
+  # Open MPI
+  export MPI_RANK=${OMPI_COMM_WORLD_RANK}
+elif [[ -n ${MV2_COMM_WORLD_RANK+z} ]]; then
+  # MVAPICH2
+  export MPI_RANK=${MV2_COMM_WORLD_RANK}
+elif [[ -n ${SLURM_PROCID+z} ]]; then
+  # srun
+  export MPI_RANK=${SLURM_PROCID}
+fi
+outfile="results_Rocprof-MNH${XYZ}_${NP}NP_${NG}NG_${NC}NC_${CG}CG.${MPI_RANK}IP.${SLURM_JOBID}"
+# Hand the arguments over untouched ("$@") instead of re-parsing them with eval.
+rocprof -o "${outfile}.csv" "$@"
+status=$?
+# Remove the .db/.json/.sysinfo.txt files sharing the output prefix.
+rm -f -- "${outfile}".{db,json,sysinfo.txt}
+# Report the status of the rocprof run, not the status of rm.
+exit "${status}"
diff --git a/bin/set_rocm_device b/bin/set_rocm_device
new file mode 100755
index 000000000..74833ee79
--- /dev/null
+++ b/bin/set_rocm_device
@@ -0,0 +1,35 @@
+#!/bin/bash
+# set_rocm_device: choose the GPU exposed to this MPI task through
+# ROCR_VISIBLE_DEVICES, then exec the command given as arguments.
+# Usage: [mpirun|srun ...] set_rocm_device COMMAND [ARGS...]
+# Optional environment: NB_DEVICE (device count; otherwise counted from the
+# output of 'rocm-smi -i'), GPU_OFFSET (added to the device index, default 0).
+#set -x
+
+# Unless NB_DEVICE is preset, count the 'rocm-smi -i' lines mentioning GPU.
+NB_DEVICE=${NB_DEVICE:-$( rocm-smi -i | grep -c GPU )}
+# Use at least one device so the division below cannot be by zero.
+(( NB_DEVICE > 0 )) || NB_DEVICE=1
+
+export GPU_OFFSET=${GPU_OFFSET:-0}
+
+#[[ ${IP} -ge 1 ]] && IP=$(( IP +1 ))
+#[[ ${IP} -ge 2 ]] && IP=$(( IP +1 ))
+
+# Rank and size come from Open MPI when set, otherwise from Slurm; the last
+# fallback (one task on one node) avoids a division by zero when neither
+# launcher provides them.
+export LIP=${OMPI_COMM_WORLD_LOCAL_RANK:-${SLURM_LOCALID:-0}}
+export IP=${OMPI_COMM_WORLD_RANK:-${SLURM_PROCID:-0}}
+export NP=${OMPI_COMM_WORLD_SIZE:-${SLURM_NTASKS:-1}}
+export NN=${OMPI_MCA_orte_num_nodes:-${SLURM_NNODES:-1}}
+# Tasks per node, rounded up.
+export NPN=$(( 1 + (NP-1)/ NN ))
+export HN=$( hostname )
+#export ROCR_VISIBLE_DEVICES=$(( IP % NB_DEVICE ))
+# Consecutive local ranks share a device: ceil(NPN / NB_DEVICE) ranks per GPU.
+export ROCR_VISIBLE_DEVICES=$(( GPU_OFFSET + LIP / ( 1 + (NPN-1) / NB_DEVICE ) ))
+echo "LIP=${LIP} IP=${IP} NP=${NP} NN=${NN} NPN=${NPN} NG=${NB_DEVICE} GPU=${ROCR_VISIBLE_DEVICES} ${HN}"
+
+# "$@" keeps every argument intact; unquoted $* re-split them on whitespace.
+exec "$@"
-- 
GitLab