Add gpu resource usage tracking (#7075)

This commit is contained in:
Sagar Dhawan
2019-11-21 08:33:02 -08:00
committed by GitHub
parent 2c1b8fdd39
commit 79199711b8
2 changed files with 94 additions and 1 deletions

View File

@@ -19,8 +19,27 @@ while true; do
# Take the last line of the captured top output that matches '.*B Mem' and
# reduce it to "<total> <used>" (the sed pattern accepts digits and dots, with
# an optional trailing '%').
# NOTE(review): $top_ouput is assigned outside this excerpt; if no line matches,
# total/used end up empty or non-numeric and the awk division below fails — confirm.
ram_total_and_usage=$(echo "${top_ouput}" | grep '.*B Mem'| tail -1 | sed "s/.*: *\([0-9.]*\)%* total.*, *\([0-9.]*\)%* used.*/\1 \2/")
# Split the two whitespace-separated values into their own variables.
read -r total used <<< "$ram_total_and_usage"
# Used memory as a percentage of total; awk does the arithmetic so decimal
# values are handled.
ram_usage=$(awk "BEGIN {print $used / $total * 100}")
# CPU/RAM portion of the datapoint fields ($cpu_usage is set outside this excerpt).
cpu_report="cpu_usage=$cpu_usage,ram_usage=$ram_usage"
# NOTE(review): this value is reassigned from cpu_report and gpu_report further
# down before it is read, so this line looks like the pre-change side of the
# diff (shown here without +/- markers) — confirm.
report="cpu_usage=$cpu_usage,ram_usage=$ram_usage"
# If nvidia-smi exists, report GPU stats averaged over all GPUs.
#
# Result: gpu_report is either empty (no nvidia-smi, or no usable rows) or
# ",avg_gpu_usage=<pct>,avg_gpu_mem_usage=<pct>", ready to be appended to the
# CPU/RAM field list. It is never emitted with empty values, so a failing or
# GPU-less nvidia-smi cannot produce a malformed datapoint.
gpu_report=""
if [ -x "$(command -v nvidia-smi)" ]; then
  # One CSV row per GPU: "<utilization>, <memory used>, <memory total>".
  mapfile -t individual_gpu_usage < <(
    nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total \
      --format=csv,nounits,noheader
  )
  total_gpu_usage=0
  total_gpu_mem_usage=0
  # Count only the rows that contribute to the sums, so the averages stay
  # correct when some rows are skipped.
  num_gpus=0
  gpu_number_re='^[0-9]+([.][0-9]+)?$'
  gpu_zero_re='^0*([.]0*)?$'
  for entry in "${individual_gpu_usage[@]}"; do
    # Split on commas and/or spaces so both "a, b, c" and "a,b,c" parse.
    IFS=', ' read -r compute mem_used mem_total _ <<< "$entry"
    # Skip rows that are not three plain numbers (e.g. "[N/A]" or error text).
    [[ $compute =~ $gpu_number_re && $mem_used =~ $gpu_number_re && $mem_total =~ $gpu_number_re ]] || continue
    # A zero memory total would divide by zero below.
    [[ $mem_total =~ $gpu_zero_re ]] && continue
    # Values are passed with -v rather than spliced into the awk program text.
    total_gpu_usage=$(awk -v sum="$total_gpu_usage" -v val="$compute" \
      'BEGIN { print sum + val }')
    total_gpu_mem_usage=$(awk -v sum="$total_gpu_mem_usage" -v used="$mem_used" -v total="$mem_total" \
      'BEGIN { print sum + used / total * 100 }')
    num_gpus=$((num_gpus + 1))
  done
  # Only report when at least one GPU row was usable; this also avoids the
  # division by zero when nvidia-smi printed nothing.
  if ((num_gpus > 0)); then
    avg_gpu_usage=$(awk -v sum="$total_gpu_usage" -v n="$num_gpus" 'BEGIN { print sum / n }')
    avg_gpu_mem_usage=$(awk -v sum="$total_gpu_mem_usage" -v n="$num_gpus" 'BEGIN { print sum / n }')
    gpu_report=",avg_gpu_usage=$avg_gpu_usage,avg_gpu_mem_usage=$avg_gpu_mem_usage"
  fi
fi
# Join the CPU/RAM fields with the GPU fields; gpu_report is empty when no GPU
# stats were collected, which leaves just the CPU/RAM fields.
printf -v report '%s%s' "${cpu_report}" "${gpu_report}"
# Hand one "system-stats" datapoint, tagged with this host's name, to the
# metrics writer script.
./scripts/metrics-write-datapoint.sh "system-stats,hostname=${HOSTNAME} ${report}"
# Wait a second before the enclosing loop takes the next sample.
sleep 1
done