[TESTING] Added testing utilities for fixing clock and using cuda-memcheck (#500)

This commit is contained in:
Philippe Tillet
2022-04-21 22:40:10 -07:00
committed by GitHub
parent 073be1d2ee
commit 7d6c504e8d

View File

@@ -1,6 +1,8 @@
import functools
import os
import subprocess
import sys
from contextlib import contextmanager
import torch
@@ -358,6 +360,80 @@ def get_max_tensorcore_tflops(dtype: torch.dtype, backend=None, device=None, clo
tflops = num_subcores * clock_rate * ops_per_sub_core * 1e-9
return tflops
# create decorator that wraps test function into
# a cuda-memcheck system call
def cuda_memcheck(**target_kwargs):
def decorator(test_fn):
@functools.wraps(test_fn)
def wrapper(*args, **kwargs):
import psutil
ppid_name = psutil.Process(os.getppid()).name()
run_cuda_memcheck = target_kwargs.items() <= kwargs.items()
if run_cuda_memcheck and ppid_name != "cuda-memcheck":
path = os.path.realpath(test_fn.__globals__["__file__"])
# get path of current file
env = {"PATH": os.environ["PATH"], "PYTORCH_NO_CUDA_MEMORY_CACHING": "1"}
assert 'request' in kwargs, "memcheck'ed test must have a (possibly unused) `request` fixture"
test_id = kwargs['request'].node.callspec.id
cmd = f"{path}::{test_fn.__name__}[{test_id}]"
out = subprocess.run(["cuda-memcheck", "pytest", "-vs", cmd], capture_output=True, env=env)
assert out.returncode == 0, "cuda-memcheck returned an error: bounds checkng failed"
assert "ERROR SUMMARY: 0 errors" in str(out.stdout)
else:
test_fn(*args, **kwargs)
return wrapper
return decorator
def nvsmi_attr(attrs):
    """Query integer attributes of GPU 0 through ``nvidia-smi``.

    Runs ``nvidia-smi -i 0 --query-gpu=<attrs> --format=csv,noheader,nounits``
    and parses the comma-separated reply.

    Args:
        attrs: iterable of attribute names accepted by ``--query-gpu``
            (e.g. ``["clocks.current.sm"]``).

    Returns:
        A list of ints, one per comma-separated field of the reply.

    Raises:
        subprocess.CalledProcessError: if ``nvidia-smi`` exits with an error.
        OSError: if ``nvidia-smi`` cannot be launched.
        ValueError: if a returned field is not an integer.
    """
    query = ",".join(attrs)
    cmd = [
        "nvidia-smi",
        "-i",
        "0",
        "--query-gpu=" + query,
        "--format=csv,noheader,nounits",
    ]
    out = subprocess.check_output(cmd)
    # Decode with a fixed codec: the encoding of *our* stdout says nothing about
    # the child's output, and sys.stdout.encoding may be None or missing when
    # stdout has been replaced or detached.
    fields = out.decode("utf-8").split(",")
    # int() tolerates the surrounding whitespace/newline left by the CSV output.
    return [int(field) for field in fields]
@contextmanager
def set_gpu_clock(ref_sm_clock=1350, ref_mem_clock=1215):
    """Lock the SM and memory clocks of GPU 0 for the duration of a ``with`` block.

    On entry, persistence mode is enabled and both clocks are locked to the
    requested frequencies through ``nvidia-smi``; the current clocks are then
    read back and must be within 10 of the requested values. On exit (normal
    or exceptional), persistence mode is disabled and both clock locks are
    reset.

    Args:
        ref_sm_clock: SM clock to lock to (MHz, per the assertion messages).
        ref_mem_clock: memory clock to lock to (MHz, per the assertion messages).

    Yields:
        ``(tflops, gbps)``: figures derived from the reference clocks with
        hard-coded hardware constants.
        NOTE(review): the constants (108, 4 * 256, 640) look specific to one
        GPU model -- presumably A100; confirm before using on other hardware.

    Raises:
        AssertionError: if a read-back clock differs from its reference by 10
            or more.
        subprocess.CalledProcessError / OSError: if an ``nvidia-smi`` call fails.
    """
    smi = ["nvidia-smi", "-i", "0"]
    try:
        subprocess.check_output(smi + ["-pm", "1"])
        subprocess.check_output(
            smi + [f"--lock-gpu-clocks={ref_sm_clock},{ref_sm_clock}"]
        )
        subprocess.check_output(
            smi + [f"--lock-memory-clocks={ref_mem_clock},{ref_mem_clock}"]
        )
        cur_sm_clock = nvsmi_attr(["clocks.current.sm"])[0]
        cur_mem_clock = nvsmi_attr(["clocks.current.memory"])[0]
        assert abs(cur_sm_clock - ref_sm_clock) < 10, f"GPU SMs must run at {ref_sm_clock} MHz"
        assert abs(cur_mem_clock - ref_mem_clock) < 10, f"GPU memory must run at {ref_mem_clock} MHz"
        tflops = 1e-6 * 2 * 108 * 4 * 256 * ref_sm_clock
        gbps = 640 * 2 * ref_mem_clock * 1e-3
        yield tflops, gbps
    finally:
        # Attempt every reset even if an earlier one fails, so a single failing
        # command cannot leave the remaining clock locks in place. The first
        # failure is re-raised once all resets have been tried.
        first_error = None
        for reset_args in (["-pm", "0"], ["-rgc"], ["-rmc"]):
            try:
                subprocess.check_output(smi + reset_args)
            except (subprocess.CalledProcessError, OSError) as err:
                if first_error is None:
                    first_error = err
        if first_error is not None:
            raise first_error
def get_max_simd_tflops(dtype: torch.dtype, backend=None, device=None):
if not backend: