[codegen] worked around a bug, seemingly in nvptx/ptxas, by simplifying multiplications by 1:

- Generated LLVM-IR looked correct
- Illegal addressing disappeared when running cuda-memcheck
- Illegal addressing disappeared when using nvptx-short-pointer
Author: Philippe Tillet
Date: 2019-08-29 21:34:23 -07:00
Parent: 141a823799
Commit: 7e0af2118c
9 changed files with 108 additions and 82 deletions
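The workaround described above is implemented as a new rewrite_mult peephole (see the peephole hunks below) that forwards the other operand whenever one side of a multiplication is a splat of the constant 1. A minimal standalone sketch of that fold, using hypothetical toy value classes rather than the actual triton::ir types:

// Toy illustration only: value, constant_int, splat and mul below are
// stand-ins for the real triton::ir classes used in the peephole hunk.
#include <cassert>

struct value        { virtual ~value() = default; };
struct constant_int : value { long v; explicit constant_int(long val): v(val) {} };
struct splat        : value { value *arg; explicit splat(value *a): arg(a) {} };
struct mul          : value { value *lhs, *rhs; mul(value *l, value *r): lhs(l), rhs(r) {} };

// Returns the operand that should replace the multiply when the other operand
// is a splat of the constant 1, or nullptr when the pattern does not match.
value *fold_mul_by_one(mul *m) {
  auto is_splat_one = [](value *v) {
    auto *s = dynamic_cast<splat*>(v);
    auto *c = s ? dynamic_cast<constant_int*>(s->arg) : nullptr;
    return c && c->v == 1;
  };
  if (is_splat_one(m->lhs)) return m->rhs;   // splat(1) * x  ->  x
  if (is_splat_one(m->rhs)) return m->lhs;   // x * splat(1)  ->  x
  return nullptr;
}

int main() {
  constant_int one(1), x(42);
  splat ones(&one);                          // broadcast of the constant 1
  mul m(&ones, &x);
  assert(fold_mul_by_one(&m) == &x);         // the multiply folds away
}

In the pass itself, the surviving operand replaces every use of the multiply via replace_all_uses_with, so downstream pointer arithmetic no longer goes through the multiplication that appeared to trigger the ptxas issue.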

@@ -23,6 +23,7 @@ private:
bool rewrite_dot_fp32(ir::dot_inst *dot, ir::builder& builder, bool trans_a, bool trans_b, ir::value *A, ir::value *B, ir::value *D);
bool rewrite_dot_hmma(ir::dot_inst *dot, ir::builder& builder, bool trans_a, bool trans_b, ir::value *A, ir::value *B, ir::value *D);
bool rewrite_dot(ir::instruction *value, ir::builder& builder);
bool rewrite_mult(ir::instruction *value, ir::builder& builder);
bool rewrite_unit_red(ir::instruction *value, ir::builder& builder);
bool rewrite_gep_ptr_min_off_plus_off(ir::instruction *value, ir::builder& builder);

@@ -95,7 +95,7 @@ void grids::init_c_graph(ir::instruction *v) {
}
// Splat
else if(dynamic_cast<ir::splat_inst*>(v)){
return;
}
// Trans
else if(auto *x = dynamic_cast<ir::trans_inst*>(v)){

@@ -469,21 +469,21 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::function<Value*(ir
return (Instruction*)res;
}
if(ir::atomic_add_inst* ii = dynamic_cast<ir::atomic_add_inst*>(inst)){
Value *ptr = value(ii->get_operand(0));
Value *val = value(ii->get_operand(1));
Value *atom_f_add = nullptr;
if(val->getType()->isFloatTy())
atom_f_add = Intrinsic::getDeclaration(builder.GetInsertBlock()->getModule(), Intrinsic::nvvm_atomic_load_add_f32, {ptr->getType()});
else if(val->getType()->isHalfTy()){
Type *fp16 = Type::getHalfTy(ctx);
// Value *ptr = value(ii->get_operand(0));
// Value *val = value(ii->get_operand(1));
// Value *atom_f_add = nullptr;
// if(val->getType()->isFloatTy())
// atom_f_add = Intrinsic::getDeclaration(builder.GetInsertBlock()->getModule(), Intrinsic::nvvm_atomic_load_add_f32, {ptr->getType()});
// else if(val->getType()->isHalfTy()){
// Type *fp16 = Type::getHalfTy(ctx);
FunctionType *atom_ty = FunctionType::get(fp16, {fp16->getPointerTo(), fp16}, false);
atom_f_add = InlineAsm::get(atom_ty, " atom.relaxed.global.gpu.add.noftz.f16 $0, [$1], $2;", "=h,l,h", true);
}
if(atom_f_add == nullptr)
throw std::runtime_error("unsupported atomic add");
Value *res = builder.CreateCall(atom_f_add, {ptr, val});
return (Instruction*)res;
// FunctionType *atom_ty = FunctionType::get(fp16, {fp16->getPointerTo(), fp16}, false);
// atom_f_add = InlineAsm::get(atom_ty, " atom.relaxed.global.gpu.add.noftz.f16 $0, [$1], $2;", "=h,l,h", true);
// }
// if(atom_f_add == nullptr)
throw std::runtime_error("unsupported");
// Value *res = builder.CreateCall(atom_f_add, {ptr, val});
// return (Instruction*)res;
}
if(ir::sqrt_inst* ii = dynamic_cast<ir::sqrt_inst*>(inst)){
Value *val = value(ii->get_operand(0));

@@ -169,6 +169,7 @@ bool peephole::rewrite_dot(ir::instruction *value, ir::builder& builder){
return false;
ir::value *a = dot->get_operand(0);
ir::value *b = dot->get_operand(1);
builder.set_insert_point(add);
ir::value * new_dot = builder.insert(ir::dot_inst::create(a, b, other,
dot->is_a_trans(), dot->is_b_trans(),
dot->get_name()));
@@ -212,6 +213,30 @@ bool peephole::rewrite_unit_red(ir::instruction *value, ir::builder& builder){
return false;
}
bool peephole::rewrite_mult(ir::instruction *value, ir::builder& builder) {
auto binop = dynamic_cast<ir::binary_operator*>(value);
if(binop && binop->get_op() == ir::binary_op_t::Mul) {
ir::value *lhs = binop->get_operand(0);
ir::value *rhs = binop->get_operand(1);
ir::constant_int *_1_lhs = nullptr;
if(ir::splat_inst *splat = dynamic_cast<ir::splat_inst*>(lhs))
_1_lhs = dynamic_cast<ir::constant_int*>(splat->get_operand(0));
ir::constant_int *_1_rhs = nullptr;
if(ir::splat_inst *splat = dynamic_cast<ir::splat_inst*>(rhs))
_1_rhs = dynamic_cast<ir::constant_int*>(splat->get_operand(0));
if(_1_lhs){
binop->replace_all_uses_with(rhs);
return true;
}
else if(_1_rhs){
binop->replace_all_uses_with(lhs);
return true;
}
}
return false;
}
bool peephole::rewrite_gep_ptr_min_off_plus_off(ir::instruction *value, ir::builder& builder) {
auto x = dynamic_cast<ir::getelementptr_inst*>(value);
if(!x)
@@ -250,8 +275,9 @@ void peephole::run(ir::module &mod) {
if(seen.find(i) != seen.end())
continue;
bool was_modified = rewrite_dot(i, builder);
if(was_modified)
if(was_modified){
seen.insert(i);
}
}
}while(seen.size() != n_seen);
@@ -265,6 +291,7 @@ void peephole::run(ir::module &mod) {
if(seen.find(i) != seen.end())
continue;
bool was_modified = false;
was_modified = was_modified || rewrite_mult(i, builder);
was_modified = was_modified || rewrite_trans_phi(i, builder);
was_modified = was_modified || rewrite_unit_red(i, builder);
was_modified = was_modified || rewrite_gep_ptr_min_off_plus_off(i, builder);

@@ -218,29 +218,24 @@ ocl_module::ocl_module(driver::context * context, llvm::Module* src): module(con
/* ------------------------ */
std::string cu_module::compile_llvm_module(llvm::Module* module) {
// set data layout
std::string layout = "e";
bool is_64bit = true;
bool use_short_pointers = true;
if (!is_64bit)
layout += "-p:32:32";
else if (use_short_pointers)
layout += "-p3:32:32-p4:32:32-p5:32:32";
layout += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";
// create
llvm::SmallVector<char, 0> buffer;
module::compile_llvm_module(module, "nvptx64-nvidia-cuda", "sm_70", layout, buffer, "", Assembly);
std::string result(buffer.begin(), buffer.end());
size_t start_replace = result.find(".version");
size_t end_replace = result.find('\n', start_replace);
assert(start_replace != std::string::npos);
result.replace(start_replace, end_replace - start_replace, ".version 6.4");
return result;
// options
auto options = llvm::cl::getRegisteredOptions();
static_cast<llvm::cl::opt<bool>*>(options["nvptx-short-ptr"])->setValue(true);
// create
llvm::SmallVector<char, 0> buffer;
module::compile_llvm_module(module, "nvptx64-nvidia-cuda", "sm_70", "", buffer, "", Assembly);
std::string result(buffer.begin(), buffer.end());
size_t start_replace = result.find(".version");
size_t end_replace = result.find('\n', start_replace);
assert(start_replace != std::string::npos);
result.replace(start_replace, end_replace - start_replace, ".version 6.4");
return result;
}
cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { }
cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){
// std::cout << source << std::endl;
cu_context::context_switcher ctx_switch(*context);
// JIT compile source-code
CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER};
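Instead of hand-building a 32-bit-pointer data layout, the new code above flips the NVPTX backend's nvptx-short-ptr flag through LLVM's command-line option registry and compiles with an empty data-layout string. A hedged sketch of that lookup, with a guard for builds where the option is not registered (the helper name is illustrative, not from the commit):

// Sketch (assumed LLVM C++ API, llvm/Support/CommandLine.h): flip a registered
// boolean backend option such as "nvptx-short-ptr"; returns false when the
// option is not present in the current build.
#include "llvm/Support/CommandLine.h"

static bool set_bool_cl_option(const char *name, bool enable) {
  auto &opts = llvm::cl::getRegisteredOptions();
  auto it = opts.find(name);
  if (it == opts.end())
    return false;                       // e.g. NVPTX backend not linked in
  static_cast<llvm::cl::opt<bool>*>(it->second)->setValue(enable);
  return true;
}

Compared with the unguarded options["nvptx-short-ptr"] lookup in the hunk, the find() check avoids dereferencing a null Option* when the flag is absent, since StringMap::operator[] silently inserts a default (null) entry.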

@@ -49,8 +49,13 @@ void print(module &mod, std::ostream& os) {
size_t num_ops = inst->get_num_operands();
if(num_ops > 0)
os << " ";;
for(unsigned i = 0; i < num_ops; i++)
os << get_name(ops[i], cnt++) << (i < num_ops - 1?", ":"");
for(unsigned i = 0; i < num_ops; i++){
if(auto *x = dynamic_cast<ir::constant_int*>(ops[i]))
os << x->get_value();
else
os << get_name(ops[i], cnt++);
os << (i < num_ops - 1?", ":"");
}
os << ";" << std::endl;
}
os << std::endl;

@@ -217,6 +217,7 @@ std::unique_ptr<driver::module> function::make_bin(ir::module &module, driver::c
dce.run(module);
vectorize.run(module);
dce.run(module);
// ir::print(module, std::cout);
// generate llvm code
llvm::LLVMContext ctx;
std::unique_ptr<llvm::Module> llvm(new llvm::Module(module.get_name(), ctx));

@@ -1,23 +1,43 @@
import triton
import tensorflow as tf
import triton
import numpy as np
src = """
#if AT == 1
#define USEA ^a
#define STRIDE_AK lda
#define STRIDE_AM 1
#define BROADCAST_AK :, newaxis
#define BROADCAST_AM newaxis, :
#define SHAPE_A TK, TM
#else
#define USEA a
#define STRIDE_AK 1
#define STRIDE_AM lda
#define BROADCAST_AK newaxis, :
#define BROADCAST_AM :, newaxis
#define SHAPE_A TM, TK
#endif
#if BT == 1
#define USEB ^b
#define STRIDE_BK 1
#define STRIDE_BN ldb
#define BROADCAST_BK newaxis, :
#define BROADCAST_BN :, newaxis
#define SHAPE_B TN, TK
#else
#define USEB b
#define STRIDE_BK ldb
#define STRIDE_BN 1
#define BROADCAST_BK :, newaxis
#define BROADCAST_BN newaxis, :
#define SHAPE_B TK, TN
#endif
void dot(TYPE * A __noalias __readonly __aligned(16),
TYPE * B __noalias __readonly __aligned(16),
TYPE * C __noalias __readonly __aligned(16),
void dot(TYPE * A,
TYPE * B,
TYPE * C,
int M, int N, int K,
int lda __multipleof(8),
int ldb __multipleof(8),
@@ -31,42 +51,20 @@ void dot(TYPE * A __noalias __readonly __aligned(16),
int rka[TK] = 0 ... TK;
int rkb[TK] = 0 ... TK;
float xc[TM, TN] = 0;
/* pointers for A */
#if AT == 1
TYPE* pa[TK, TM] = A + rka[:, newaxis]*lda + rxa[newaxis, :];
TYPE a[TK, TM] = *pa;
#else
TYPE* pa[TM, TK] = A + rka[newaxis, :] + rxa[:, newaxis]*lda;
TYPE a[TM, TK] = *pa;
#endif
/* pointers for B */
#if BT == 1
TYPE* pb[TN, TK] = B + rkb[newaxis, :] + ryb[:, newaxis]*ldb;
TYPE b[TN, TK] = *pb;
#else
TYPE* pb[TK, TN] = B + rkb[:, newaxis]*ldb + ryb[newaxis, :];
TYPE b[TK, TN] = *pb;
#endif
/* pointers for operands */
TYPE* pa[SHAPE_A] = A + rka[BROADCAST_AK] * STRIDE_AK + rxa[BROADCAST_AM] * STRIDE_AM;
TYPE* pb[SHAPE_B] = B + rkb[BROADCAST_BK] * STRIDE_BK + ryb[BROADCAST_BN] * STRIDE_BN;
/* prefetches operands */
TYPE a[SHAPE_A] = *pa;
TYPE b[SHAPE_B] = *pb;
/* reduction loop */
for(int k = K; k > 0; k = k - TK){
xc = USEA @ USEB + xc;
#if AT == 1
pa = pa + TK*lda;
#else
pa = pa + TK;
#endif
#if BT == 1
pb = pb + TK;
#else
pb = pb + TK*ldb;
#endif
pa = pa + TK * STRIDE_AK;
pb = pb + TK * STRIDE_BK;
a = *pa;
b = *pb;
}
/* epilogue */
int rxc[TM] = ridx * TM + (0 ... TM);
int ryc[TN] = ridy * TN + (0 ... TN);
@@ -75,7 +73,7 @@ void dot(TYPE * A __noalias __readonly __aligned(16),
bool checkc0[TM] = rxc < M;
bool checkc1[TN] = ryc < N;
bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];
*?(checkc) pc = c;
*pc = c;
}
"""
@@ -112,10 +110,12 @@ class dot_op:
AT = self.trans_a, BT = self.trans_b, TYPE = tf.float16,
TM = [128], TN = [ 128], TK = [32])
dot_nt = dot_op(False, True)
dot_nn = dot_op(False, False)
dot_tn = dot_op(True, False)
dot_tt = dot_op(True, True)
def dot(a, b, trans_a = False, trans_b = False):
if (trans_a, trans_b) not in dot.ops:
dot.ops[trans_a, trans_b] = dot_op(trans_a, trans_b)
return dot.ops[trans_a, trans_b](a, b)
dot.ops = dict()
# @triton.register_gradient(dot_op)
# def _dot_grad(op, dy):
@@ -127,9 +127,7 @@ def run_dot():
M, N, K = 128, 128, 128
a = tf.placeholder(tf.float16, shape=[M, K])
b = tf.placeholder(tf.float16, shape=[N, K])
# c = tf.matmul(a, b, transpose_a=True)
c = dot_nt(a, b)
# grads = tf.gradients(c, [a])
c = dot(a, b, trans_a = False, trans_b = True)
# Reference
ha = np.random.rand(M, K).astype(np.float16)
hb = np.random.rand(K, N).astype(np.float16)
@@ -142,8 +140,6 @@ def run_dot():
hresult = np.dot(ha, hb.T)
dif = np.abs(result - hresult)
np.savetxt('dif.dat', dif, '%2.4f')
print(hresult)
print(result)
print("dif: %f" % np.max(dif))
run_dot()

@@ -105,7 +105,8 @@ def _build(src, path, framework):
if framework == tensorflow_id:
_import_tensorflow()
library_dirs += [tensorflow.sysconfig.get_lib()]
include_dirs += [tensorflow.sysconfig.get_lib()]
include_dirs += [tensorflow.sysconfig.get_include()]
include_dirs += ['/usr/local/cuda/include/']
libraries += ['tensorflow_framework']
elif framework == torch_id:
_import_torch()
@@ -215,7 +216,7 @@ class op:
self.fw_grids = dict()
self.src = src
self.outputs = outputs
self.framework = _find_framework(None)
self.framework = _find_framework(framework)
def __call__(self, *args, **kwargs):
# create a new op when defines are different