diff --git a/python/triton/testing.py b/python/triton/testing.py index fbca719ff..bfcd6ef6b 100644 --- a/python/triton/testing.py +++ b/python/triton/testing.py @@ -1,6 +1,8 @@ +import functools import os import subprocess import sys +from contextlib import contextmanager import torch @@ -358,6 +360,80 @@ def get_max_tensorcore_tflops(dtype: torch.dtype, backend=None, device=None, clo tflops = num_subcores * clock_rate * ops_per_sub_core * 1e-9 return tflops +# create decorator that wraps test function into +# a cuda-memcheck system call + + +def cuda_memcheck(**target_kwargs): + def decorator(test_fn): + @functools.wraps(test_fn) + def wrapper(*args, **kwargs): + import psutil + ppid_name = psutil.Process(os.getppid()).name() + run_cuda_memcheck = target_kwargs.items() <= kwargs.items() + if run_cuda_memcheck and ppid_name != "cuda-memcheck": + path = os.path.realpath(test_fn.__globals__["__file__"]) + # get path of current file + env = {"PATH": os.environ["PATH"], "PYTORCH_NO_CUDA_MEMORY_CACHING": "1"} + assert 'request' in kwargs, "memcheck'ed test must have a (possibly unused) `request` fixture" + test_id = kwargs['request'].node.callspec.id + cmd = f"{path}::{test_fn.__name__}[{test_id}]" + out = subprocess.run(["cuda-memcheck", "pytest", "-vs", cmd], capture_output=True, env=env) + assert out.returncode == 0, "cuda-memcheck returned an error: bounds checkng failed" + assert "ERROR SUMMARY: 0 errors" in str(out.stdout) + else: + test_fn(*args, **kwargs) + return wrapper + return decorator + + +def nvsmi_attr(attrs): + attrs = ",".join(attrs) + cmd = [ + "nvidia-smi", + "-i", + "0", + "--query-gpu=" + attrs, + "--format=csv,noheader,nounits", + ] + out = subprocess.check_output(cmd) + ret = out.decode(sys.stdout.encoding).split(",") + ret = [int(x) for x in ret] + return ret + + +@contextmanager +def set_gpu_clock(ref_sm_clock=1350, ref_mem_clock=1215): + try: + subprocess.check_output(["nvidia-smi", "-i", "0", "-pm", "1"]) + subprocess.check_output( + [ + "nvidia-smi", + "-i", + "0", + f"--lock-gpu-clocks={ref_sm_clock},{ref_sm_clock}", + ] + ) + subprocess.check_output( + [ + "nvidia-smi", + "-i", + "0", + f"--lock-memory-clocks={ref_mem_clock},{ref_mem_clock}", + ] + ) + cur_sm_clock = nvsmi_attr(["clocks.current.sm"])[0] + cur_mem_clock = nvsmi_attr(["clocks.current.memory"])[0] + assert abs(cur_sm_clock - ref_sm_clock) < 10, f"GPU SMs must run at {ref_sm_clock} MHz" + assert abs(cur_mem_clock - ref_mem_clock) < 10, f"GPU SMs must run at {ref_mem_clock} MHz" + tflops = 1e-6 * 2 * 108 * 4 * 256 * ref_sm_clock + gbps = 640 * 2 * ref_mem_clock * 1e-3 + yield tflops, gbps + finally: + subprocess.check_output(["nvidia-smi", "-i", "0", "-pm", "0"]) + subprocess.check_output(["nvidia-smi", "-i", "0", "-rgc"]) + subprocess.check_output(["nvidia-smi", "-i", "0", "-rmc"]) + def get_max_simd_tflops(dtype: torch.dtype, backend=None, device=None): if not backend: