[BACKEND] Various bug fixes; making reductions faster (#533)

2022-05-31 17:14:44 -07:00
parent 37037bb3be
commit 3e7500dfe6
12 changed files with 174 additions and 66 deletions
--- a/lib/codegen/analysis/align.cc
+++ b/lib/codegen/analysis/align.cc
@@ -319,8 +319,8 @@ std::vector<unsigned> align::populate_max_contiguous_binop(ir::binary_operator*
    }
    if(x->is_int_add_sub()){
      unsigned lvalue = 1, rvalue = 1;
-      lvalue = gcd(rhs_max_contiguous[d], lhs_starting_multiple[d]);
-      rvalue = gcd(lhs_max_contiguous[d], rhs_starting_multiple[d]);
+      lvalue = gcd(rhs_max_contiguous[d], lhs_cst_info[d].num_cst);
+      rvalue = gcd(lhs_max_contiguous[d], rhs_cst_info[d].num_cst);
      value = std::max(lvalue, rvalue);
    }
    result.push_back(value);
--- a/lib/codegen/analysis/layout.cc
+++ b/lib/codegen/analysis/layout.cc
@@ -209,14 +209,15 @@ mma_layout::mma_layout(size_t num_warps,
    rep_ = {2*pack_size_0, 2*pack_size_1, 1};
    spw_ = {fpw_[0]*4*rep_[0], fpw_[1]*4*rep_[1], 1};
    contig_per_thread_ = {1, 1};
+    order_ = {0, 1};
  }
  else{
    // fpw_ = {1, 1, 1};
    spw_ = mma_instr_shape_.at(tensor_core_type_); // e.g., {16, 8, 16} for f32.f16.f16.f32
    contig_per_thread_ = {1, 2};
+    order_ = {1, 0};
    // rep_ = {2,  2, 1};
  }
-  order_ = {0, 1};

  /* warps per tile */
  wpt_ = {1, 1, 1};
@@ -616,8 +617,9 @@ void layouts::run(ir::module &mod) {
      unsigned axis = red->get_axis();
      // shape
      auto shapes = arg->get_type()->get_block_shapes();
-      scanline_layout *layout = get(arg)->to_scanline();
-      shapes[axis] = layout->mts(axis);
+      distributed_layout* layout = dynamic_cast<analysis::distributed_layout*>(get(arg));
+      shapes[axis] = layout->shape_per_cta(axis) / layout->contig_per_thread(axis);
+      
      // create layout
      layouts_[id] = new shared_layout(layout, axes_->get(arg), shapes, {red}, red->get_type()->get_scalar_ty(), align_, tgt_);
      tmp_[red] = id;