Add gpu resource usage tracking (#7075)
This commit is contained in:
@ -9764,6 +9764,80 @@
|
|||||||
]
|
]
|
||||||
],
|
],
|
||||||
"tags": []
|
"tags": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"groupBy": [
|
||||||
|
{
|
||||||
|
"params": [
|
||||||
|
"$__interval"
|
||||||
|
],
|
||||||
|
"type": "time"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"params": [
|
||||||
|
"null"
|
||||||
|
],
|
||||||
|
"type": "fill"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"orderByTime": "ASC",
|
||||||
|
"policy": "default",
|
||||||
|
"query": "SELECT mean(\"avg_gpu_usage\") as \"gpu_usage\" FROM \"$testnet\".\"autogen\".\"system-stats\" WHERE hostname =~ /$hostid/ AND $timeFilter GROUP BY time(5s) fill(null)\n",
|
||||||
|
"rawQuery": true,
|
||||||
|
"refId": "C",
|
||||||
|
"resultFormat": "time_series",
|
||||||
|
"select": [
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"params": [
|
||||||
|
"value"
|
||||||
|
],
|
||||||
|
"type": "field"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"params": [],
|
||||||
|
"type": "mean"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"groupBy": [
|
||||||
|
{
|
||||||
|
"params": [
|
||||||
|
"$__interval"
|
||||||
|
],
|
||||||
|
"type": "time"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"params": [
|
||||||
|
"null"
|
||||||
|
],
|
||||||
|
"type": "fill"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"orderByTime": "ASC",
|
||||||
|
"policy": "default",
|
||||||
|
"query": "SELECT mean(\"avg_gpu_mem_usage\") as \"gpu_memory_usage\" FROM \"$testnet\".\"autogen\".\"system-stats\" WHERE hostname =~ /$hostid/ AND $timeFilter GROUP BY time(5s) fill(null)\n",
|
||||||
|
"rawQuery": true,
|
||||||
|
"refId": "D",
|
||||||
|
"resultFormat": "time_series",
|
||||||
|
"select": [
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"params": [
|
||||||
|
"value"
|
||||||
|
],
|
||||||
|
"type": "field"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"params": [],
|
||||||
|
"type": "mean"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"tags": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"thresholds": [],
|
"thresholds": [],
|
||||||
|
@ -19,8 +19,27 @@ while true; do
|
|||||||
ram_total_and_usage=$(echo "${top_ouput}" | grep '.*B Mem'| tail -1 | sed "s/.*: *\([0-9.]*\)%* total.*, *\([0-9.]*\)%* used.*/\1 \2/")
|
ram_total_and_usage=$(echo "${top_ouput}" | grep '.*B Mem'| tail -1 | sed "s/.*: *\([0-9.]*\)%* total.*, *\([0-9.]*\)%* used.*/\1 \2/")
|
||||||
read -r total used <<< "$ram_total_and_usage"
|
read -r total used <<< "$ram_total_and_usage"
|
||||||
ram_usage=$(awk "BEGIN {print $used / $total * 100}")
|
ram_usage=$(awk "BEGIN {print $used / $total * 100}")
|
||||||
|
cpu_report="cpu_usage=$cpu_usage,ram_usage=$ram_usage"
|
||||||
|
|
||||||
report="cpu_usage=$cpu_usage,ram_usage=$ram_usage"
|
# If nvidia-smi exists, report GPU stats averaged over all GPUs.
# gpu_report is left empty whenever no usable GPU data is available, so the
# CPU/RAM part of the datapoint is never followed by empty "key=" fields.
gpu_report=""
if [ -x "$(command -v nvidia-smi)" ]; then
  # nvidia-smi is asked for one CSV row per GPU (no header, no units):
  #   <utilization.gpu>, <memory.used>, <memory.total>
  # A single awk pass accumulates the rows instead of spawning awk per GPU.
  # Rows with a non-numeric field (e.g. "[N/A]") or a zero memory total are
  # skipped, and nothing is printed when no row qualifies; this avoids both a
  # division by zero and interpolating arbitrary text into an awk program.
  # The trailing "||" keeps the section best-effort if the pipeline fails.
  gpu_report=$(
    nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total --format=csv,nounits,noheader |
      awk -F '[ \t]*,[ \t]*' '
        BEGIN { num = "^[0-9]+([.][0-9]+)?$" }
        { gsub(/^[ \t]+|[ \t\r]+$/, "") }
        NF == 3 && $1 ~ num && $2 ~ num && $3 ~ num && ($3 + 0) > 0 {
          usage_sum += $1
          mem_sum += $2 / $3 * 100
          gpus++
        }
        END {
          if (gpus > 0) {
            printf ",avg_gpu_usage=%s,avg_gpu_mem_usage=%s", (usage_sum / gpus), (mem_sum / gpus)
          }
        }
      '
  ) || gpu_report=""
fi
|
||||||
|
|
||||||
|
report="${cpu_report}${gpu_report}"
|
||||||
./scripts/metrics-write-datapoint.sh "system-stats,hostname=$HOSTNAME $report"
|
./scripts/metrics-write-datapoint.sh "system-stats,hostname=$HOSTNAME $report"
|
||||||
sleep 1
|
sleep 1
|
||||||
done
|
done
|
||||||
|
Reference in New Issue
Block a user