[STYLE] run autopep8 and isort (#421)

Run: ``` isort ./python autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py') ``` with an `.isort.cfg` and then clean up a few warts. This PR should be a no-op; the idea is that this is all boring whitespace changes, and any config file changes will be in a different change to make it easier to review.
2022-01-06 14:34:17 -08:00
parent 120cda015e
commit 8bf551ae7a
30 changed files with 742 additions and 623 deletions
--- a/python/test/regression/test_performance.py
+++ b/python/test/regression/test_performance.py
@@ -1,14 +1,18 @@
-from numpy import record
-import torch
-import triton
+import triton.language as tl
 import subprocess
 import sys
+
 import pytest
+import torch
+from numpy import record
+
+import triton

 #######################
 # Utilities
 #######################

+
 def nvsmi(attrs):
    attrs = ','.join(attrs)
    cmd = ['nvidia-smi', '-i', '0', '--query-gpu=' + attrs, '--format=csv,noheader,nounits']
@@ -23,48 +27,51 @@ def nvsmi(attrs):
 #######################

 matmul_data = {
-  # square
-  (256 , 256 , 256  ) : {'v100': 0.027},
-  (512 , 512 , 512  ) : {'v100': 0.158},
-  (1024, 1024, 1024 ) : {'v100': 0.466},
-  (2048, 2048, 2048 ) : {'v100': 0.680},
-  (4096, 4096, 4096 ) : {'v100': 0.831},
-  (8192, 8192, 8192 ) : {'v100': 0.849},
-  # tall-skinny
-  (16  , 1024, 1024 ) : {'v100': 0.0128},
-  (16  , 4096, 4096 ) : {'v100': 0.0883},
-  (16  , 8192, 8192 ) : {'v100': 0.101},
-  (64  , 1024, 1024 ) : {'v100': 0.073},
-  (64  , 4096, 4096 ) : {'v100': 0.270},
-  (64  , 8192, 8192 ) : {'v100': 0.360},
-  (1024, 64  , 1024 ) : {'v100': 0.0692},
-  (4096, 64  , 4096 ) : {'v100': 0.264},
-  (8192, 64  , 8192 ) : {'v100': 0.323},
-#   # deep reductions
-#   (64  , 64  , 16384) : {'v100': 0.},
-#   (64  , 64  , 65536) : {'v100': 0.},
-#   (256 , 256 , 8192 ) : {'v100': 0.},
-#   (256 , 256 , 32768) : {'v100': 0.},
+    # square
+    (256, 256, 256): {'v100': 0.027},
+    (512, 512, 512): {'v100': 0.158},
+    (1024, 1024, 1024): {'v100': 0.466},
+    (2048, 2048, 2048): {'v100': 0.680},
+    (4096, 4096, 4096): {'v100': 0.831},
+    (8192, 8192, 8192): {'v100': 0.849},
+    # tall-skinny
+    (16, 1024, 1024): {'v100': 0.0128},
+    (16, 4096, 4096): {'v100': 0.0883},
+    (16, 8192, 8192): {'v100': 0.101},
+    (64, 1024, 1024): {'v100': 0.073},
+    (64, 4096, 4096): {'v100': 0.270},
+    (64, 8192, 8192): {'v100': 0.360},
+    (1024, 64, 1024): {'v100': 0.0692},
+    (4096, 64, 4096): {'v100': 0.264},
+    (8192, 64, 8192): {'v100': 0.323},
+    #   # deep reductions
+    #   (64  , 64  , 16384) : {'v100': 0.},
+    #   (64  , 64  , 65536) : {'v100': 0.},
+    #   (256 , 256 , 8192 ) : {'v100': 0.},
+    #   (256 , 256 , 32768) : {'v100': 0.},
 }
+
+
@pytest.mark.parametrize('M, N, K', matmul_data.keys())
 def test_matmul(M, N, K):
    ref_gpu_util = matmul_data[(M, N, K)]['v100']
    cur_sm_clock = nvsmi(['clocks.current.sm'])[0]
    ref_sm_clock = 1350
-    max_gpu_perf = 1e-6*80*8*128*cur_sm_clock
+    max_gpu_perf = 1e-6 * 80 * 8 * 128 * cur_sm_clock
    assert abs(cur_sm_clock - ref_sm_clock) < 10, f'GPU SMs must run at {ref_sm_clock} MHz'
    a = torch.randn((M, K), dtype=torch.float16, device='cuda')
    b = torch.randn((K, N), dtype=torch.float16, device='cuda')
    fn = lambda: triton.ops.matmul(a, b)
    ms = triton.testing.do_bench(fn, percentiles=None, warmup=25, rep=1000)
-    cur_gpu_perf = 2.*M*N*K/ms * 1e-9
+    cur_gpu_perf = 2. * M * N * K / ms * 1e-9
    cur_gpu_util = cur_gpu_perf / max_gpu_perf
    triton.testing.assert_almost_equal(cur_gpu_util, ref_gpu_util, decimal=2)

+
 #######################
 # Element-Wise
 #######################
-import triton.language as tl
+

@triton.jit
 def _add(x_ptr, y_ptr, output_ptr, n_elements,
@@ -80,21 +87,22 @@ def _add(x_ptr, y_ptr, output_ptr, n_elements,


 elementwise_data = {
-  1024*16   : {'v100': 0.0219},
-  1024*64   : {'v100': 0.0791},
-  1024*256  : {'v100': 0.243},
-  1024*1024 : {'v100': 0.534},
-  1024*4096 : {'v100': 0.796},
-  1024*16384: {'v100': 0.905},
-  1024*65536: {'v100': 0.939},
+    1024 * 16: {'v100': 0.0219},
+    1024 * 64: {'v100': 0.0791},
+    1024 * 256: {'v100': 0.243},
+    1024 * 1024: {'v100': 0.534},
+    1024 * 4096: {'v100': 0.796},
+    1024 * 16384: {'v100': 0.905},
+    1024 * 65536: {'v100': 0.939},
 }

+
@pytest.mark.parametrize('N', elementwise_data.keys())
 def test_elementwise(N):
    ref_gpu_util = elementwise_data[N]['v100']
    cur_mem_clock = nvsmi(['clocks.current.memory'])[0]
    ref_mem_clock = 877
-    max_gpu_perf = 512*2*ref_mem_clock*1e-3
+    max_gpu_perf = 512 * 2 * ref_mem_clock * 1e-3
    assert abs(cur_mem_clock - ref_mem_clock) < 10, f'GPU memmory must run at {ref_mem_clock} MHz'
    z = torch.empty((N, ), dtype=torch.float16, device='cuda')
    x = torch.randn_like(z)
@@ -102,7 +110,6 @@ def test_elementwise(N):
    grid = lambda args: (triton.cdiv(N, args['BLOCK_SIZE']), )
    fn = lambda: _add[grid](x, y, z, N, BLOCK_SIZE=1024)
    ms = triton.testing.do_bench(fn, percentiles=None, warmup=25, rep=250)
-    cur_gpu_perf = 3.*N*z.element_size()/ms*1e-6
+    cur_gpu_perf = 3. * N * z.element_size() / ms * 1e-6
    cur_gpu_util = cur_gpu_perf / max_gpu_perf
    triton.testing.assert_almost_equal(cur_gpu_util, ref_gpu_util, decimal=2)
-