Revert "[BACKEND] Various bug fixes; making reductions faster (#533)".
This is a more stable commit that produce bitwise identical code to earlier versions. Using commits after this one may lead to slightly different numerics
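The "slightly different numerics" caveat is the usual floating-point story: re-tiling a reduction changes the order in which partial sums are combined, and float addition is not associative. A minimal standalone illustration, not taken from the Triton codebase:

```cpp
#include <cstdio>

int main() {
    // Summing the same three values in two different orders: float
    // addition is not associative, so a re-tiled reduction can
    // legitimately produce bitwise-different results.
    float a = 1e8f, b = -1e8f, c = 1.0f;
    float left_to_right = (a + b) + c;  // 1.0f
    float right_to_left = a + (b + c);  // 0.0f: c is absorbed into b's magnitude
    printf("%.1f vs %.1f\n", left_to_right, right_to_left);
}
```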
```diff
@@ -209,15 +209,14 @@ mma_layout::mma_layout(size_t num_warps,
     rep_ = {2*pack_size_0, 2*pack_size_1, 1};
     spw_ = {fpw_[0]*4*rep_[0], fpw_[1]*4*rep_[1], 1};
-    contig_per_thread_ = {1, 1};
     order_ = {0, 1};
   }
   else{
     // fpw_ = {1, 1, 1};
     spw_ = mma_instr_shape_.at(tensor_core_type_); // e.g., {16, 8, 16} for f32.f16.f16.f32
-    contig_per_thread_ = {1, 2};
     order_ = {1, 0};
     // rep_ = {2, 2, 1};
   }
+  order_ = {0, 1};
 
   /* warps per tile */
   wpt_ = {1, 1, 1};
```
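In the first hunk, the pre-Ampere branch derives the shape covered per warp from fragments per warp (fpw_) and replication (rep_), while the newer branch reads a fixed per-instruction shape such as {16, 8, 16}. A sketch of that arithmetic; the helper and the example fpw_/rep_ values are assumptions for illustration, not Triton API:

```cpp
#include <array>
#include <cstdio>

// Hypothetical helper mirroring the two branches above: pre-sm80
// derives shape-per-warp from fpw and rep, sm80+ uses a fixed
// per-instruction shape such as {16, 8, 16} for f32.f16.f16.f32.
std::array<int, 3> shape_per_warp(bool pre_sm80,
                                  std::array<int, 3> fpw,
                                  std::array<int, 3> rep) {
    if (pre_sm80)
        return {fpw[0] * 4 * rep[0], fpw[1] * 4 * rep[1], 1};
    return {16, 8, 16};  // mma_instr_shape_ entry in the real code
}

int main() {
    // e.g. fpw = {2, 2, 1} and rep = {2, 2, 1} give a 16x16 tile per warp.
    auto spw = shape_per_warp(true, {2, 2, 1}, {2, 2, 1});
    printf("%d x %d\n", spw[0], spw[1]);  // 16 x 16
}
```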
```diff
@@ -617,9 +616,8 @@ void layouts::run(ir::module &mod) {
       unsigned axis = red->get_axis();
       // shape
       auto shapes = arg->get_type()->get_block_shapes();
-      distributed_layout* layout = dynamic_cast<analysis::distributed_layout*>(get(arg));
-      shapes[axis] = layout->shape_per_cta(axis) / layout->contig_per_thread(axis);
-
+      scanline_layout *layout = get(arg)->to_scanline();
+      shapes[axis] = layout->mts(axis);
       // create layout
       layouts_[id] = new shared_layout(layout, axes_->get(arg), shapes, {red}, red->get_type()->get_scalar_ty(), align_, tgt_);
       tmp_[red] = id;
```
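The second hunk swaps #533's generalized scratch-shape computation, shape_per_cta(axis) / contig_per_thread(axis) on any distributed layout, back to the scanline-specific mts(axis). Assuming mts(axis) threads tile the axis and each thread owns contig_per_thread(axis) contiguous elements, shape_per_cta(axis) is their product, so the two expressions coincide for scanline layouts. A minimal sketch under those assumptions; ScanlineAxis is a hypothetical stand-in, not Triton's class:

```cpp
#include <cassert>

// Hypothetical stand-in for a scanline layout along one axis, assuming
// shape_per_cta = threads_on_axis * contiguous_elements_per_thread.
struct ScanlineAxis {
    int mts;  // threads tiling this axis
    int nts;  // contiguous elements owned by each thread
    int shape_per_cta() const { return mts * nts; }
    int contig_per_thread() const { return nts; }
};

int main() {
    ScanlineAxis ax{32, 4};
    // #533's generalized expression vs. the reverted scanline-specific one:
    assert(ax.shape_per_cta() / ax.contig_per_thread() == ax.mts);
    return 0;
}
```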