From abe3fbb480efb25662d501d220cbab4eebde9995 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Sun, 20 Oct 2019 01:01:53 -0400
Subject: [PATCH] [test] [reduce] added test for 1D reduction

---
 lib/codegen/analysis/axes.cc       |  2 +
 lib/codegen/selection/generator.cc |  5 +-
 lib/ir/instructions.cc             |  6 +--
 lib/lang/ast.cc                    |  7 ++-
 lib/lang/code_gen.cc               |  3 ++
 tests/common/reduce.h              | 85 ++++++++++++++++--------
 tests/common/src/reduce.h          | 25 ++++++---
 tests/unit/reduce.cc               |  1 +
 8 files changed, 79 insertions(+), 55 deletions(-)

diff --git a/lib/codegen/analysis/axes.cc b/lib/codegen/analysis/axes.cc
index c446558a8..6271e224b 100644
--- a/lib/codegen/analysis/axes.cc
+++ b/lib/codegen/analysis/axes.cc
@@ -106,6 +106,8 @@ void axes::update_graph_elementwise(ir::instruction *i) {
 }
 
 void axes::update_graph_no_edge(ir::instruction *i) {
+  if(!i->get_type()->is_tile_ty())
+    return;
   auto rank = i->get_type()->get_tile_rank();
   for(unsigned d = 0; d < rank; d++)
     graph_.add_edge({i, d}, {i, d});
diff --git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc
index d9d1e1cec..d7b9bc6a3 100644
--- a/lib/codegen/selection/generator.cc
+++ b/lib/codegen/selection/generator.cc
@@ -825,13 +825,12 @@ void generator::visit_reduce_inst(ir::reduce_inst* x) {
   }
   tgt_->add_barrier(mod_, *builder_);
   // write back
-  distributed_tile* x_tile = (distributed_tile*)tmap_.at(x);
-  x_tile->for_each([&](indices_t idx) {
+  for_each(x, [&](indices_t idx) {
     indices_t red_idx = idx;
     red_idx.insert(red_idx.begin() + axis, builder_->getInt32(0));
     Value *read_offset = shared_tile::shared_offset(*builder_, stile->get_shapes(), stile->get_perm(), stile->get_order(), red_idx);
     Value *read_ptr = builder_->CreateGEP(base_ptr, read_offset);
-    x_tile->set_value(idx, builder_->CreateLoad(read_ptr));
+    set_value(x, idx, builder_->CreateLoad(read_ptr));
   });
 }
 
diff --git a/lib/ir/instructions.cc b/lib/ir/instructions.cc
index 01f961402..0be815a51 100644
--- a/lib/ir/instructions.cc
+++ b/lib/ir/instructions.cc
@@ -651,10 +651,10 @@ type* reduce_inst::get_res_type(value *arg, unsigned axis) {
   ir::tile_type::tile_shapes_t shapes = arg->get_type()->get_tile_shapes();
   shapes.erase(shapes.begin() + axis);
   type *scalar_ty = arg->get_type()->get_scalar_ty();
-  if(shapes.size() == 0)
+  if(shapes.empty())
+//    shapes.push_back(1);
     return scalar_ty;
-  else
-    return tile_type::get(scalar_ty, shapes);
+  return tile_type::get(scalar_ty, shapes);
 }
 
 reduce_inst::reduce_inst(value *arg, op_t op, unsigned axis, const std::string &name, instruction *next)
diff --git a/lib/lang/ast.cc b/lib/lang/ast.cc
index b0a50adc3..bf0c7e964 100644
--- a/lib/lang/ast.cc
+++ b/lib/lang/ast.cc
@@ -567,7 +567,7 @@ void BinaryOp::AssignOpTypeChecking() {
   // The other constraints are lefted to cast operator
   rhs_ = Expr::MayCast(rhs_, ScalarOrLikeTile(rhs_, lhsScalType));
   type_ = lhs_->Type();
-  Broadcast(this, lhs_, rhs_, type_);
+  rhs_ = UnaryOp::New(Token::CAST, rhs_, type_);
 }
 
 /*
@@ -688,7 +688,10 @@ void UnaryOp::ReduceOpTypeChecking() {
     Error(this, "array expected for reduction operation");
   auto shape = tileType->Shape();
   shape.erase(shape.begin() + ax);
-  type_ = TileType::New(shape, tileType->Derived());
+  if(shape.empty())
+    type_ = tileType->Derived();
+  else
+    type_ = TileType::New(shape, tileType->Derived());
 }
 
 void UnaryOp::TransOpTypeChecking() {
diff --git a/lib/lang/code_gen.cc b/lib/lang/code_gen.cc
index 4bf9d4a2c..aee604b4a 100644
--- a/lib/lang/code_gen.cc
+++ b/lib/lang/code_gen.cc
@@ -467,6 +467,9 @@ ir::value* Generator::GenBroadcastOp(ir::value* src, ir::type* dst_ty) {
       return bld_->create_broadcast(src, dst_shapes);
     }
   }
+  else if(src->get_type()->is_tile_ty() && src->get_type()->get_tile_num_elements() == 1){
+    return bld_->create_downcast(src);
+  }
   return src;
 }
 
diff --git a/tests/common/reduce.h b/tests/common/reduce.h
index 86e066638..ba4e6e470 100644
--- a/tests/common/reduce.h
+++ b/tests/common/reduce.h
@@ -19,6 +19,8 @@ void cc_reduce_nd(std::vector<T> &y, const std::vector<T> &x, reduce_op_t op, si
   // remove shape at index axis to get outer dimensions
   std::vector<int> outer = shapes;
   outer.erase(outer.begin() + axis);
+  if(outer.empty())
+    outer.push_back(1);
   // retrieve shape at index axis to get inner dimension
   int inner = shapes[axis];
   // accumualtion function
@@ -42,7 +44,7 @@ enum run_mode_t {
   BENCH,
   TEST
 };
-void triton_reduce_nd(drv::stream* stream, const std::vector<int>& shape,
+void triton_reduce_nd(drv::stream* stream, const std::vector<int>& shape_x,
                       int axis, reduce_op_t op,
                       const std::vector<int>& x_order, const std::vector<int>& y_order,
                       std::vector<std::vector<std::string>> TS,
@@ -53,86 +55,91 @@ void triton_reduce_nd(drv::stream* stream, const std::vector<int>& shape,
   drv::context* context = stream->context();
   size_t axy = (axis == 0) ? 1 : 0;
+  // shape
+  std::vector<int> shape_y = shape_x;
+  shape_y.erase(shape_y.begin() + axis);
+
   // rank
-  size_t rank = shape.size();
+  int rank_x = shape_x.size();
+  int rank_y = shape_y.size();
+
   // size
-  size_t size = 1;
-  for(int32_t d: shape)
-    size *= d;
-  std::vector<std::string> shapename = {"S0", "S1", "S2"};
+  size_t size_x = 1;
+  for(int32_t d: shape_x)
+    size_x *= d;
+  size_t size_y = 1;
+  for(int32_t d: shape_y)
+    size_y *= d;
+  // strides for x
+  std::vector<std::string> x_shapename = {"S0", "S1", "S2"};
   std::vector<std::string> x_strides = {"1"};
-  for(size_t d = 0; d < rank - 1; d++)
-    x_strides.push_back(x_strides[d] + " * " + shapename[x_order[d]]);
+  for(int d = 0; d < rank_x - 1; d++)
+    x_strides.push_back(x_strides[d] + " * " + x_shapename[x_order[d]]);
+  // strides for y
+  std::vector<std::string> y_shapename = x_shapename;
+  y_shapename.erase(y_shapename.begin() + axis);
   std::vector<std::string> y_strides = {"1"};
-  for(size_t d = 0; d < rank - 1; d++)
-    y_strides.push_back(y_strides[d] + " * " + shapename[y_order[d]]);
+  for(int d = 0; d < rank_y - 1; d++)
+    y_strides.push_back(y_strides[d] + " * " + y_shapename[y_order[d]]);
 
-  // create inputs
-  auto dx = std::unique_ptr<drv::buffer>(drv::buffer::create(context, size*dtsize));
-  auto dy = std::unique_ptr<drv::buffer>(drv::buffer::create(context, size*dtsize));
-  // create options
+  // options
   rt::function::options_space_t opt;
-
-  // type
   opt.defines.push_back({"TYPE", {ty}});
-  // x strides
-  for(size_t d = 0; d < rank; d++)
+  for(int d = 0; d < rank_x; d++)
     opt.defines.push_back({"STRIDE_XS" + std::to_string(x_order[d]), {x_strides[d]}});
-  // y strides
-  for(size_t d = 0; d < rank; d++)
+  for(int d = 0; d < rank_y; d++)
     opt.defines.push_back({"STRIDE_YS" + std::to_string(y_order[d]), {y_strides[d]}});
   if(TS.empty())
-    TS = tile_nd(rank);
-  // tile size
-  for(size_t d = 0; d < rank; d++)
+    TS = tile_nd(rank_x);
+  for(int d = 0; d < rank_x; d++)
     opt.defines.push_back({"TS" + std::to_string(d), TS[d]});
-  // non-reduced axis
-  std::string RY = (axis == 0) ? "rn" : "rm";
-  opt.defines.push_back({"TY", {std::to_string(shape[axy])}});
+  std::string RY = (axis == 0) ? "rs1" : "rs0";
"rs1" : "rs0"; + opt.defines.push_back({"TY", {std::to_string(shape_x[axy])}}); opt.defines.push_back({"RY", {RY}}); - // reduction broadcasting std::string RED = ""; - for(int n = 0; n < 2; n++){ + for(int n = 0; n < rank_x; n++){ if(n > 0) RED += ", "; RED += (n==axis) ? to_str(op) : ":"; } opt.defines.push_back({"RED", {RED}}); - - opt.num_warps = {4}; + opt.num_warps = {1}; // kernel - rt::function function(src::reduce2d, opt); + rt::function function(src::reduce_nd[rank_x - 1], opt); + + // input buffers + auto dx = std::unique_ptr(drv::buffer::create(context, size_x*dtsize)); + auto dy = std::unique_ptr(drv::buffer::create(context, size_y*dtsize)); // grid std::vector args = {&*dx, &*dy}; - for(int32_t d: shape) + for(int32_t d: shape_x) args.push_back(d); - args.push_back(shape[0]); std::vector ts = {"TS0", "TS1", "TS2"}; - auto grid = grid_nd(shape, ts); + auto grid = grid_nd(shape_x, ts); // metrics if(mode == BENCH){ - auto gbps = [&](double ns) { return 2 * size * dtsize / (ns * 1e-9) * 1e-9; }; + auto gbps = [&](double ns) { return 2 * size_x * dtsize / (ns * 1e-9) * 1e-9; }; double triton_ns = triton::tools::bench([&]() { function(args, grid, stream);}, stream); bench.push_back(gbps(triton_ns)); } // test triton if(mode == TEST){ - std::vector hy(shape[axy]); - std::vector ry(shape[axy]); - std::vector hx(shape[0]*shape[1]); + std::vector hy(size_y); + std::vector ry(size_y); + std::vector hx(size_x); init_zeros(hy); init_rand(hx); stream->write(&*dx, true, 0, hx); function(args, grid, stream); stream->synchronize(); stream->read(&*dy, true, 0, hy); - cc_reduce_nd(ry, hx, op, axis, shape); + cc_reduce_nd(ry, hx, op, axis, shape_x); test = testing::diff(hy, ry); } } diff --git a/tests/common/src/reduce.h b/tests/common/src/reduce.h index cc44ca5fc..158e68567 100644 --- a/tests/common/src/reduce.h +++ b/tests/common/src/reduce.h @@ -4,7 +4,11 @@ namespace src { R"( void reduce1d(TYPE * X __noalias __readonly __aligned(16), TYPE * Y __noalias __readonly __aligned(16), - int N) { + int S0) { + int pid0 = get_program_id(0); + int rs0[TS0] = pid0 * TS0 + 0 ... TS0; + TYPE* px[TS0] = X + rs0; + *Y = (*px)[RED]; } )"; @@ -13,15 +17,20 @@ void reduce1d(TYPE * X __noalias __readonly __aligned(16), R"( void reduce2d(TYPE * X __noalias __readonly __aligned(16), TYPE * Y __noalias __writeonly __aligned(16), - int M, int N, int ldx) { - int ridm = get_program_id(0); - int ridn = get_program_id(1); - int rm[TS0] = ridm * TS0 + 0 ... TS0; - int rn[TS1] = ridn * TS1 + 0 ... TS1; - TYPE* px[TS0, TS1] = X + rm[:, newaxis] + rn[newaxis, :] * ldx; - TYPE* py[TY] = Y + RY; + int S0, int S1) { + int pid0 = get_program_id(0); + int pid1 = get_program_id(1); + int rs0[TS0] = pid0 * TS0 + 0 ... TS0; + int rs1[TS1] = pid1 * TS1 + 0 ... TS1; + TYPE* px[TS0, TS1] = X + rs0[:, newaxis] * STRIDE_XS0 + + rs1[newaxis, :] * STRIDE_XS1; + TYPE* py[TY] = Y + RY * STRIDE_YS0; *py = (*px)[RED]; } )"; + + const char* reduce_nd[] = {reduce1d, reduce2d}; + + } diff --git a/tests/unit/reduce.cc b/tests/unit/reduce.cc index 0ee43cbc5..5a311686c 100644 --- a/tests/unit/reduce.cc +++ b/tests/unit/reduce.cc @@ -20,6 +20,7 @@ int main() { // shapes to benchmark typedef std::tuple, int, reduce_op_t> config_t; std::vector configs = { + config_t{{32}, 0, ADD}, config_t{{32, 32}, 0, MAX}, config_t{{32, 32}, 1, ADD}, config_t{{32, 64}, 0, ADD},