[OPS] Add performance model for gemm/gemv (#397)

Significantly improves the performance of `triton.ops.matmul` in memory-bound settings by using many more block configs, coupled with a performance model that drives the auto-tuning process.
2021-12-22 01:56:10 +08:00
parent 5cdb948c05
commit 39d4bfed83
12 changed files with 289 additions and 27 deletions
--- a/include/triton/codegen/transform/cts.h
+++ b/include/triton/codegen/transform/cts.h
@@ -33,4 +33,4 @@ private:
 }
 }

-#endif
+#endif
--- a/include/triton/codegen/transform/peephole.h
+++ b/include/triton/codegen/transform/peephole.h
@@ -35,7 +35,7 @@ private:
  bool rewrite_select_masked_load(ir::instruction *value, ir::builder& builder);
  bool rewrite_load_to_shared(ir::instruction *value, ir::builder& builder);
  bool rewrite_cvt_layout(ir::instruction *value, ir::builder& builder);
-
+ 
 public:
  peephole(target* tgt, analysis::layouts* layouts): tgt_(tgt), layouts_(layouts) {}
  void run(ir::module &mod);
--- a/include/triton/ir/instructions.h
+++ b/include/triton/ir/instructions.h
@@ -455,7 +455,7 @@ public:
 // masked load async
 class masked_load_async_inst: public load_inst {
 private:
-  std::string repr_impl() const { return "masked_load_async_async" + get_cache_modifier_repr(); }
+  std::string repr_impl() const { return "masked_load_async" + get_cache_modifier_repr(); }
  masked_load_async_inst(value *ptr, value *mask, value *false_value, load_inst::CACHE_MODIFIER cache,
                   const std::string &name, instruction *next);

@@ -728,12 +728,21 @@ public:
 class dot_inst: public builtin_inst {
 public:
  enum TransT { NoTrans, Trans };
+  enum DataType { 
+    FP8, FP16, BF16, TF32, FP32, 
+    INT1, INT4, INT8, INT32, 
+    UNKNOWN,
+  };

 private:
  dot_inst(value *A, value *B, value *C, TransT AT, TransT BT, const std::string &name, instruction *next);
  std::string repr_impl() const { return "dot"; }

  bool is_prefetched_ = false;
+  DataType C_type_ = DataType::FP32;
+  DataType A_type_ = DataType::FP16;
+  DataType B_type_ = DataType::FP16;
+  
 public:
  bool is_prefetched() const { return is_prefetched_; }
  void set_prefetched(bool is_prefetched) { is_prefetched_ = is_prefetched; }