[OPS] Add performance model for gemm/gemv (#397)

Significantly improves the performance of `triton.ops.matmul` in memory-bound settings by using many more block configurations, coupled with a performance model that drives the auto-tuning process.
2021-12-22 01:56:10 +08:00
parent 5cdb948c05
commit 39d4bfed83
12 changed files with 289 additions and 27 deletions
--- a/python/test/regression/test_performance.py
+++ b/python/test/regression/test_performance.py
@@ -25,7 +25,7 @@ def nvsmi(attrs):
 matmul_data = {
  # square
  (256 , 256 , 256  ) : {'v100': 0.027},
-  (512 , 512 , 512  ) : {'v100': 0.141},
+  (512 , 512 , 512  ) : {'v100': 0.158},
  (1024, 1024, 1024 ) : {'v100': 0.466},
  (2048, 2048, 2048 ) : {'v100': 0.680},
  (4096, 4096, 4096 ) : {'v100': 0.831},
@@ -35,10 +35,10 @@ matmul_data = {
  (16  , 4096, 4096 ) : {'v100': 0.0883},
  (16  , 8192, 8192 ) : {'v100': 0.101},
  (64  , 1024, 1024 ) : {'v100': 0.073},
-  (64  , 4096, 4096 ) : {'v100': 0.228},
+  (64  , 4096, 4096 ) : {'v100': 0.270},
  (64  , 8192, 8192 ) : {'v100': 0.360},
  (1024, 64  , 1024 ) : {'v100': 0.0692},
-  (4096, 64  , 4096 ) : {'v100': 0.223},
+  (4096, 64  , 4096 ) : {'v100': 0.264},
  (8192, 64  , 8192 ) : {'v100': 0.323},
 #   # deep reductions
 #   (64  , 64  , 16384) : {'v100': 0.},