[TESTING] Added testing utilities for fixing clock and using cuda-memcheck (#500)
@@ -1,6 +1,8 @@
import functools
import os
import subprocess
import sys
from contextlib import contextmanager

import torch

@@ -358,6 +360,80 @@ def get_max_tensorcore_tflops(dtype: torch.dtype, backend=None, device=None, clo
    tflops = num_subcores * clock_rate * ops_per_sub_core * 1e-9
    return tflops


# create a decorator that wraps a test function in a cuda-memcheck system call
def cuda_memcheck(**target_kwargs):
    def decorator(test_fn):
        @functools.wraps(test_fn)
        def wrapper(*args, **kwargs):
            import psutil
            ppid_name = psutil.Process(os.getppid()).name()
            # only re-run under cuda-memcheck when the decorator's kwargs are a subset of the test's kwargs
            run_cuda_memcheck = target_kwargs.items() <= kwargs.items()
            if run_cuda_memcheck and ppid_name != "cuda-memcheck":
                # get the path of the file that defines the test
                path = os.path.realpath(test_fn.__globals__["__file__"])
                # disable PyTorch's caching allocator so cuda-memcheck can observe out-of-bounds accesses
                env = {"PATH": os.environ["PATH"], "PYTORCH_NO_CUDA_MEMORY_CACHING": "1"}
                assert 'request' in kwargs, "memcheck'ed test must have a (possibly unused) `request` fixture"
                test_id = kwargs['request'].node.callspec.id
                cmd = f"{path}::{test_fn.__name__}[{test_id}]"
                # re-run this single test in a subprocess under cuda-memcheck
                out = subprocess.run(["cuda-memcheck", "pytest", "-vs", cmd], capture_output=True, env=env)
                assert out.returncode == 0, "cuda-memcheck returned an error: bounds checking failed"
                assert "ERROR SUMMARY: 0 errors" in str(out.stdout)
            else:
                test_fn(*args, **kwargs)
        return wrapper
    return decorator
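For illustration, a pytest test could opt in roughly as sketched below. The test name, parameter, and workload are hypothetical; the `request` fixture must be accepted so the decorator can rebuild the exact test id to re-run under cuda-memcheck, and the memcheck re-run only triggers for the parameter values passed to the decorator:

import pytest

@pytest.mark.parametrize("dtype_str", ["float16", "float32"])
@cuda_memcheck(dtype_str="float16")  # hypothetical: only the float16 variant is re-run under cuda-memcheck
def test_vector_add(dtype_str, request):
    x = torch.randn(1024, device="cuda", dtype=getattr(torch, dtype_str))
    torch.testing.assert_close(x + x, 2 * x)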
def nvsmi_attr(attrs):
    attrs = ",".join(attrs)
    cmd = [
        "nvidia-smi",
        "-i",
        "0",
        "--query-gpu=" + attrs,
        "--format=csv,noheader,nounits",
    ]
    out = subprocess.check_output(cmd)
    ret = out.decode(sys.stdout.encoding).split(",")
    ret = [int(x) for x in ret]
    return ret
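For example, the current SM and memory clocks of GPU 0 can be read in a single query (the printed values are illustrative):

cur_sm_clock, cur_mem_clock = nvsmi_attr(["clocks.current.sm", "clocks.current.memory"])
print(cur_sm_clock, cur_mem_clock)  # e.g. 1350 1215 (MHz)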
@contextmanager
def set_gpu_clock(ref_sm_clock=1350, ref_mem_clock=1215):
    try:
        # enable persistence mode and lock GPU 0's SM and memory clocks to the reference values
        subprocess.check_output(["nvidia-smi", "-i", "0", "-pm", "1"])
        subprocess.check_output(
            [
                "nvidia-smi",
                "-i",
                "0",
                f"--lock-gpu-clocks={ref_sm_clock},{ref_sm_clock}",
            ]
        )
        subprocess.check_output(
            [
                "nvidia-smi",
                "-i",
                "0",
                f"--lock-memory-clocks={ref_mem_clock},{ref_mem_clock}",
            ]
        )
        cur_sm_clock = nvsmi_attr(["clocks.current.sm"])[0]
        cur_mem_clock = nvsmi_attr(["clocks.current.memory"])[0]
        assert abs(cur_sm_clock - ref_sm_clock) < 10, f"GPU SMs must run at {ref_sm_clock} MHz"
        assert abs(cur_mem_clock - ref_mem_clock) < 10, f"GPU memory must run at {ref_mem_clock} MHz"
        # peak throughput at the locked clocks; the constants appear to assume an A100
        # (108 SMs x 4 tensor cores x 256 FMA/cycle x 2 flops/FMA; 640-byte-wide DDR memory interface)
        tflops = 1e-6 * 2 * 108 * 4 * 256 * ref_sm_clock
        gbps = 640 * 2 * ref_mem_clock * 1e-3
        yield tflops, gbps
    finally:
        # restore default clock management
        subprocess.check_output(["nvidia-smi", "-i", "0", "-pm", "0"])
        subprocess.check_output(["nvidia-smi", "-i", "0", "-rgc"])
        subprocess.check_output(["nvidia-smi", "-i", "0", "-rmc"])
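A minimal usage sketch, assuming an A100 and a hypothetical run_benchmark helper that returns measured TFLOPS: the context manager yields the peak compute and bandwidth at the locked clocks, so a benchmark can report utilization against a stable baseline:

with set_gpu_clock(ref_sm_clock=1350, ref_mem_clock=1215) as (max_tflops, max_gbps):
    achieved_tflops = run_benchmark()  # hypothetical helper returning measured TFLOPS
    print(f"compute utilization: {achieved_tflops / max_tflops:.1%}")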
def get_max_simd_tflops(dtype: torch.dtype, backend=None, device=None):
    if not backend: