[BACKEND] Add bf16 & tf32 mma supports (on A100) (#426)

Authored by daadaada on 2022-01-12 02:20:31 +08:00; committed by GitHub
parent efdabe6073, commit 94a2e10fe5
17 changed files with 717 additions and 263 deletions


@@ -109,6 +109,63 @@ protected:
};
class mma_layout: public distributed_layout {
public:
enum TensorCoreType : uint8_t {
// floating-point tensor core instr
FP32_FP16_FP16_FP32 = 0, // default
FP32_BF16_BF16_FP32,
FP32_TF32_TF32_FP32,
// integer tensor core instr
INT32_INT1_INT1_INT32, // Not implemented
INT32_INT4_INT4_INT32, // Not implemented
INT32_INT8_INT8_INT32, // Not implemented
//
NOT_APPLICABLE,
};
// Used on nvidia GPUs with sm >= 80
inline static const std::map<TensorCoreType, std::vector<int>> mma_instr_shape_ = {
{FP32_FP16_FP16_FP32, {16, 8, 16}},
{FP32_BF16_BF16_FP32, {16, 8, 16}},
{FP32_TF32_TF32_FP32, {16, 8, 8}},
{INT32_INT1_INT1_INT32, {16, 8, 256}},
{INT32_INT4_INT4_INT32, {16, 8, 64}},
{INT32_INT8_INT8_INT32, {16, 8, 32}},
};
// shape of matrices loaded by ldmatrix (m-n-k, for mxk & kxn matrices)
inline static const std::map<TensorCoreType, std::vector<int>> mma_mat_shape_ = {
{FP32_FP16_FP16_FP32, {8, 8, 8}},
{FP32_BF16_BF16_FP32, {8, 8, 8}},
{FP32_TF32_TF32_FP32, {8, 8, 4}},
{INT32_INT1_INT1_INT32, {8, 8, 64}},
{INT32_INT4_INT4_INT32, {8, 8, 32}},
{INT32_INT8_INT8_INT32, {8, 8, 16}},
};
inline static const std::map<TensorCoreType, std::string> mma_instr_ptx_ = {
{FP32_FP16_FP16_FP32, "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"},
{FP32_BF16_BF16_FP32, "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32"},
{FP32_TF32_TF32_FP32, "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32"},
{INT32_INT1_INT1_INT32, "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.xor.popc"},
{INT32_INT4_INT4_INT32, "mma.sync.aligned.m16n8k64.row.col.satfinite.s32.s4.s4.s32"},
{INT32_INT8_INT8_INT32, "mma.sync.aligned.m16n8k32.row.col.satfinite.s32.s8.s8.s32"},
};
// vector length per ldmatrix (16*8/element_size_in_bits)
inline static const std::map<TensorCoreType, int> mma_instr_vec_ = {
{FP32_FP16_FP16_FP32, 8},
{FP32_BF16_BF16_FP32, 8},
{FP32_TF32_TF32_FP32, 4},
{INT32_INT1_INT1_INT32, 128},
{INT32_INT4_INT4_INT32, 32},
{INT32_INT8_INT8_INT32, 16},
};
public:
mma_layout(size_t num_warps,
const std::vector<int>& axes,
@@ -116,7 +173,8 @@ public:
const std::vector<ir::value *> &values,
analysis::align* align, target *tgt,
shared_layout* layout_a,
- shared_layout* layout_b);
+ shared_layout* layout_b,
+ ir::value *dot);
void accept(layout_visitor* vst) { vst->visit_layout_mma(this); }
// accessor
int fpw(size_t k) { return fpw_.at(k); }
@@ -124,6 +182,16 @@ public:
int spw(size_t k) { return spw_.at(k); }
int rep(size_t k) { return rep_.at(k); }
// helpers for generator.cc
std::string get_ptx_instr() const { return mma_instr_ptx_.at(tensor_core_type_); }
std::vector<int> get_mma_instr_shape() const { return mma_instr_shape_.at(tensor_core_type_); }
std::vector<int> get_mma_mat_shape() const { return mma_mat_shape_.at(tensor_core_type_); }
int get_vec_a() const { return mma_instr_vec_.at(tensor_core_type_); }
int get_vec_b() const { return mma_instr_vec_.at(tensor_core_type_); }
// setter
void set_tensor_core_type(TensorCoreType type) { tensor_core_type_ = type; }
private:
// fragment per warp
std::vector<int> fpw_;
@@ -135,6 +203,8 @@ private:
std::vector<int> spt_;
// repetitions
std::vector<int> rep_;
TensorCoreType tensor_core_type_ = FP32_FP16_FP16_FP32;
};
struct scanline_layout: public distributed_layout {
@@ -182,7 +252,7 @@ public:
const std::vector<unsigned>& shapes,
const std::vector<ir::value *> &values_,
ir::type *ty,
- analysis::align* align);
+ analysis::align* align, target *tgt);
void accept(layout_visitor* vst) { vst->visit_layout_shared(this); }
// accessors
size_t get_size() { return size_; }
@@ -197,6 +267,7 @@ public:
ir::value* hmma_dot_b() { return hmma_dot_b_; }
void set_mma_vec(int mma_vec) { mma_vec_ = mma_vec; }
int get_mma_vec() { return mma_vec_;}
int get_mma_strided() { return mma_strided_; }
data_layout* get_arg_layout() { return arg_layout_; }
private:
@@ -209,6 +280,8 @@ private:
ir::value* hmma_dot_b_;
data_layout* arg_layout_;
int mma_vec_;
int mma_strided_;
target *tgt_;
};
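
The new static tables above are all keyed by TensorCoreType, so the code generator can fetch the PTX mnemonic, the MMA instruction shape, the ldmatrix tile shape, and the per-thread vector width from the single enum value stored in tensor_core_type_. The vector-width table follows directly from the comment next to it (16*8 = 128 bits per ldmatrix row, divided by the element width in bits; tf32 counts as 32 bits because its values occupy full fp32 registers). A minimal standalone check of that rule, independent of the class itself:

#include <cassert>

// Per-ldmatrix vector length as described by the comment in mma_layout:
// 16*8 = 128 bits per fragment row, divided by the element width in bits.
constexpr int ldmatrix_vec(int element_bits) { return 16 * 8 / element_bits; }

int main() {
  assert(ldmatrix_vec(16) == 8);   // fp16 / bf16 -> 8   (FP32_FP16_FP16_FP32, FP32_BF16_BF16_FP32)
  assert(ldmatrix_vec(32) == 4);   // tf32        -> 4   (FP32_TF32_TF32_FP32)
  assert(ldmatrix_vec(8)  == 16);  // int8        -> 16  (INT32_INT8_INT8_INT32)
  assert(ldmatrix_vec(4)  == 32);  // int4        -> 32  (INT32_INT4_INT4_INT32)
  assert(ldmatrix_vec(1)  == 128); // int1        -> 128 (INT32_INT1_INT1_INT32)
  return 0;
}

The extra ir::value *dot constructor argument gives the layout access to the dot instruction whose operand types determine which enum value ends up in tensor_core_type_; that selection happens in the implementation file, which these hunks do not show.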


@@ -154,7 +154,7 @@ public:
value *create_cos(value* arg);
value *create_sin(value* arg);
value *create_log(value* arg);
- value *create_dot(value *A, value *B, value *C);
+ value *create_dot(value *A, value *B, value *C, bool allow_tf32);
value *create_trans(value *A, const std::vector<int> &perm = {});
value *create_sqrt(value *A);
value *create_reduce(value *A, reduce_inst::op_t op, unsigned axis);
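
create_dot now takes the TF32 opt-in explicitly, so every dot built through the IR builder carries the flag from the start. A minimal sketch of a call site, assuming only the signature shown above and the repository's usual header layout; the wrapper name and the way the operands are obtained are illustrative:

#include "triton/ir/builder.h"

// Illustrative wrapper: emit D = A*B + C through the IR builder, opting in or
// out of TF32 tensor cores for fp32 operands via the new flag.
triton::ir::value *emit_dot(triton::ir::builder &b,
                            triton::ir::value *A,
                            triton::ir::value *B,
                            triton::ir::value *C,
                            bool allow_tf32) {
  // allow_tf32 = false keeps fp32 dots off the tensor cores;
  // true lets the backend pick the m16n8k8 tf32 MMA variant on sm >= 80.
  return b.create_dot(A, B, C, allow_tf32);
}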


@@ -80,7 +80,7 @@ struct dispatch{
static ir::value *atomic_xchg(ir::value* ptr, ir::value *val, ir::value *msk, ir::builder *builder);
// linear algebra
- static ir::value *dot(ir::value *lhs, ir::value *rhs, ir::builder *builder);
+ static ir::value *dot(ir::value *lhs, ir::value *rhs, ir::constant_int *allow_tf32, ir::builder *builder);
// indexing
static ir::value *where(ir::value* condition, ir::value *x, ir::value *y, ir::builder *builder);
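
At the dispatch level the flag arrives as an ir::constant_int rather than a plain bool: it has to be a compile-time constant, since it selects which MMA instruction the dot is lowered to. A sketch of the hand-off this implies, assuming constant_int exposes its value through get_value() and the usual header locations; the function name is made up and the accumulator setup the real dispatch::dot performs is omitted:

#include "triton/ir/builder.h"
#include "triton/ir/constant.h"

// Illustrative only: unwrap the compile-time allow_tf32 constant and forward
// it to the builder call shown in the previous hunk.
triton::ir::value *lower_dot(triton::ir::value *lhs, triton::ir::value *rhs,
                             triton::ir::value *acc,
                             triton::ir::constant_int *allow_tf32,
                             triton::ir::builder *builder) {
  bool tf32 = allow_tf32 && allow_tf32->get_value() != 0;
  return builder->create_dot(lhs, rhs, acc, tf32);
}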


@@ -742,26 +742,29 @@ public:
};
private:
- dot_inst(value *A, value *B, value *C, TransT AT, TransT BT, const std::string &name, instruction *next);
+ dot_inst(value *A, value *B, value *C, TransT AT, TransT BT, bool allow_tf32, const std::string &name, instruction *next);
std::string repr_impl() const { return "dot"; }
- bool is_prefetched_ = false;
- DataType C_type_ = DataType::FP32;
- DataType A_type_ = DataType::FP16;
- DataType B_type_ = DataType::FP16;
public:
bool is_prefetched() const { return is_prefetched_; }
void set_prefetched(bool is_prefetched) { is_prefetched_ = is_prefetched; }
+ bool allow_tf32() const { return allow_tf32_; }
public:
- static instruction *create(value *A, value *B, value *C, bool AT, bool BT, const std::string &name = "", instruction *next = nullptr);
- static instruction* create_nn(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr);
- static instruction* create_nt(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr);
- static instruction* create_tn(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr);
- static instruction* create_tt(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr);
+ static instruction *create(value *A, value *B, value *C, bool AT, bool BT, bool allow_tf32, const std::string &name = "", instruction *next = nullptr);
+ static instruction* create_nn(value *A, value *B, value *C, bool allow_tf32, const std::string &name = "", instruction *next = nullptr);
+ static instruction* create_nt(value *A, value *B, value *C, bool allow_tf32, const std::string &name = "", instruction *next = nullptr);
+ static instruction* create_tn(value *A, value *B, value *C, bool allow_tf32, const std::string &name = "", instruction *next = nullptr);
+ static instruction* create_tt(value *A, value *B, value *C, bool allow_tf32, const std::string &name = "", instruction *next = nullptr);
_TRITON_DEFINE_CLONE(dot_inst)
_TRITON_DEFINE_ACCEPT(dot_inst)
+ private:
+ bool is_prefetched_ = false;
+ bool allow_tf32_ = false;
+ DataType C_type_ = DataType::FP32;
+ DataType A_type_ = DataType::FP16;
+ DataType B_type_ = DataType::FP16;
};
//class outer_inst: public builtin_inst {
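
dot_inst now stores the request in allow_tf32_ and exposes it through allow_tf32(), and every create_* factory threads the flag through, so later passes can read it straight off the instruction. A sketch of how this is meant to connect to the mma_layout setter from the first file; the pass in which this runs and the fp32 operand check are assumptions, only allow_tf32(), set_tensor_core_type() and the enum value come from the hunks above:

#include "triton/codegen/analysis/layout.h"
#include "triton/ir/instructions.h"

namespace analysis = triton::codegen::analysis;

// Illustrative glue: if the user asked for TF32 and the operands are fp32,
// steer the dot's mma_layout to the m16n8k8 tf32 tensor-core variant.
void pick_tf32_variant(triton::ir::dot_inst *dot,
                       analysis::mma_layout *layout,
                       bool operands_are_fp32) {
  if (dot->allow_tf32() && operands_are_fp32)
    layout->set_tensor_core_type(analysis::mma_layout::FP32_TF32_TF32_FP32);
}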