import subprocess
import sys

import pytest
import torch

import triton
import triton.language as tl

#######################
# Utilities
#######################


def nvsmi(attrs):
    """Query the given attributes of GPU 0 via nvidia-smi and return them as ints."""
    attrs = ','.join(attrs)
    cmd = ['nvidia-smi', '-i', '0', '--query-gpu=' + attrs, '--format=csv,noheader,nounits']
    out = subprocess.check_output(cmd)
    ret = out.decode(sys.stdout.encoding).split(',')
    ret = [int(x) for x in ret]
    return ret


#######################
# Matrix Multiplication
#######################

# Reference tensor-core utilization (fraction of peak FP16 throughput)
# measured on a V100 for each (M, N, K) problem size.
matmul_data = {
    # square
    (256 , 256 , 256 ) : {'v100': 0.027},
    (512 , 512 , 512 ) : {'v100': 0.141},
    (1024, 1024, 1024) : {'v100': 0.466},
    (2048, 2048, 2048) : {'v100': 0.680},
    (4096, 4096, 4096) : {'v100': 0.831},
    (8192, 8192, 8192) : {'v100': 0.841},
    # tall-skinny
    (16  , 1024, 1024) : {'v100': 0.0128},
    (16  , 4096, 4096) : {'v100': 0.0558},
    (16  , 8192, 8192) : {'v100': 0.101},
    (64  , 1024, 1024) : {'v100': 0.049},
    (64  , 4096, 4096) : {'v100': 0.211},
    (64  , 8192, 8192) : {'v100': 0.360},
    (1024, 64  , 1024) : {'v100': 0.0469},
    (4096, 64  , 4096) : {'v100': 0.198},
    (8192, 64  , 8192) : {'v100': 0.323},
    # # deep reductions
    # (64  , 64  , 16384) : {'v100': 0.},
    # (64  , 64  , 65536) : {'v100': 0.},
    # (256 , 256 , 8192 ) : {'v100': 0.},
    # (256 , 256 , 32768) : {'v100': 0.},
}


@pytest.mark.parametrize('M, N, K', matmul_data.keys())
def test_matmul(M, N, K):
    ref_gpu_util = matmul_data[(M, N, K)]['v100']
    cur_sm_clock = nvsmi(['clocks.current.sm'])[0]
    ref_sm_clock = 1350
    # V100 FP16 tensor-core peak: 80 SMs * 8 tensor cores/SM * 128 FLOPs/cycle,
    # scaled by the SM clock (MHz) -> TFLOPS
    max_gpu_perf = 1e-6 * 80 * 8 * 128 * cur_sm_clock
    assert abs(cur_sm_clock - ref_sm_clock) < 5, f'GPU SMs must run at {ref_sm_clock} MHz'
    a = torch.randn((M, K), dtype=torch.float16, device='cuda')
    b = torch.randn((K, N), dtype=torch.float16, device='cuda')
    fn = lambda: triton.ops.matmul(a, b)
    ms = triton.testing.do_bench(fn, percentiles=None, warmup=10, rep=1000)
    # 2*M*N*K FLOPs executed in `ms` milliseconds -> TFLOPS
    cur_gpu_perf = 2. * M * N * K / ms * 1e-9
    cur_gpu_util = cur_gpu_perf / max_gpu_perf
    triton.testing.assert_almost_equal(cur_gpu_util, ref_gpu_util, decimal=2)


#######################
# Element-Wise
#######################


@triton.jit
def _add(x_ptr, y_ptr, output_ptr, n_elements, **meta):
    # Each program instance adds one BLOCK_SIZE-long tile of x and y.
    BLOCK_SIZE = meta['BLOCK_SIZE']
    pid = tl.program_id(axis=0)
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements  # guard the final, possibly partial block
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    output = x + y
    tl.store(output_ptr + offsets, output, mask=mask)


# Reference memory-bandwidth utilization (fraction of peak HBM2 bandwidth)
# measured on a V100 for each vector length.
elementwise_data = {
    1024 * 16   : {'v100': 0.0219},
    1024 * 64   : {'v100': 0.0791},
    1024 * 256  : {'v100': 0.243},
    1024 * 1024 : {'v100': 0.534},
    1024 * 4096 : {'v100': 0.796},
    1024 * 16384: {'v100': 0.905},
    1024 * 65536: {'v100': 0.939},
}


@pytest.mark.parametrize('N', elementwise_data.keys())
def test_elementwise(N):
    ref_gpu_util = elementwise_data[N]['v100']
    cur_mem_clock = nvsmi(['clocks.current.memory'])[0]
    ref_mem_clock = 877
    # V100 HBM2 peak: 512 bytes/clock * 2 (double data rate), scaled by the
    # memory clock (MHz) -> GB/s (~900 GB/s at 877 MHz)
    max_gpu_perf = 512 * 2 * ref_mem_clock * 1e-3
    assert abs(cur_mem_clock - ref_mem_clock) < 5, f'GPU memory must run at {ref_mem_clock} MHz'
    z = torch.empty((N, ), dtype=torch.float16, device='cuda')
    x = torch.randn_like(z)
    y = torch.randn_like(z)
    grid = lambda meta: (triton.cdiv(N, meta['BLOCK_SIZE']), )
    fn = lambda: _add[grid](x, y, z, N, BLOCK_SIZE=1024)
    ms = triton.testing.do_bench(fn, percentiles=None, warmup=10, rep=250)
    # Each element is read twice and written once: 3 * N * sizeof(fp16) bytes
    # moved in `ms` milliseconds -> GB/s
    cur_gpu_perf = 3. * N * z.element_size() / ms * 1e-6
    cur_gpu_util = cur_gpu_perf / max_gpu_perf
    triton.testing.assert_almost_equal(cur_gpu_util, ref_gpu_util, decimal=2)
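

# A minimal sanity-check sketch, not part of the test suite: it recomputes the
# two roofline numbers the assertions above rely on, assuming the V100 figures
# used in this file (80 SMs with 8 tensor cores each at 128 FLOPs/cycle, and a
# HBM2 bus moving 512 bytes per clock at double data rate). Locking the clocks
# first (e.g. `nvidia-smi -i 0 --lock-gpu-clocks=1350,1350`) helps make the
# reference utilizations reproducible.
if __name__ == '__main__':
    sm_clock, mem_clock = nvsmi(['clocks.current.sm', 'clocks.current.memory'])
    print(f'peak compute  : {1e-6 * 80 * 8 * 128 * sm_clock:.1f} TFLOPS')
    print(f'peak bandwidth: {512 * 2 * mem_clock * 1e-3:.0f} GB/s')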