[CODEGEN] Various bugfixes and stability improvements in compiler backend (#240)

2021-08-30 11:50:35 -07:00
parent 85426dbaf7
commit 4ff3714d61
25 changed files with 568 additions and 399 deletions
--- a/lib/codegen/analysis/allocation.cc
+++ b/lib/codegen/analysis/allocation.cc
@@ -50,7 +50,6 @@ void allocation::run(ir::module &mod) {
      J.erase(j_it);
    }
  }
-
  // Build interference graph
  std::map<shared_layout*, std::set<shared_layout*>> interferences;
  for(shared_layout* x: V)
@@ -66,13 +65,10 @@ void allocation::run(ir::module &mod) {
        && XS.intersect(YS))
      interferences[x].insert(y);
  }
-
  // Initialize colors
  std::map<shared_layout*, int> colors;
  for(shared_layout* X: V)
    colors[X] = (X==V[0])?0:-1;
-
-
  // First-fit graph coloring
  std::vector<bool> available(V.size());
  for(shared_layout* x: V){
@@ -87,7 +83,6 @@ void allocation::run(ir::module &mod) {
    auto It = std::find(available.begin(), available.end(), true);
    colors[x] = std::distance(available.begin(), It);
  }
-
  // Finalize allocation
  for(shared_layout* x: V){
    unsigned Adj = 0;
@@ -95,7 +90,6 @@ void allocation::run(ir::module &mod) {
      Adj = std::max<unsigned>(Adj, starts[y] + y->get_size());
    offsets_[x] = starts[x] + colors[x] * Adj;
  }
-
  // Save maximum size of induced memory space
  allocated_size_ = 0;
  for(shared_layout* x: V)
--- a/lib/codegen/analysis/axes.cc
+++ b/lib/codegen/analysis/axes.cc
@@ -105,17 +105,17 @@ void axes::update_graph_no_edge(ir::instruction *i) {

 void axes::update_graph(ir::instruction *i) {
  switch (i->get_id()) {
-    case ir::INST_REDUCE:           return update_graph_reduce(i);
-    case ir::INST_RESHAPE:          return update_graph_reshape(i);
-    case ir::INST_SPLAT:            return update_graph_no_edge(i);;
-    case ir::INST_TRANS:            return update_graph_trans(i);
-    case ir::INST_BROADCAST:        return update_graph_broadcast(i);
-    case ir::INST_DOT:              return update_graph_dot(i);
-    case ir::INST_COPY_TO_SHARED:   return update_graph_no_edge(i);
-    case ir::INST_MASKED_LOAD_ASYNC:return update_graph_elementwise(i, false);
-    case ir::INST_COPY_FROM_SHARED: return update_graph_no_edge(i);
-    case ir::INST_RECOALESCE:       return update_graph_no_edge(i);
-    default:                        return update_graph_elementwise(i);
+    case ir::INST_REDUCE:            return update_graph_reduce(i);
+    case ir::INST_RESHAPE:           return update_graph_reshape(i);
+    case ir::INST_SPLAT:             return update_graph_no_edge(i);
+    case ir::INST_TRANS:             return update_graph_trans(i);
+    case ir::INST_BROADCAST:         return update_graph_broadcast(i);
+    case ir::INST_DOT:               return update_graph_dot(i);
+    case ir::INST_COPY_TO_SHARED:    return update_graph_no_edge(i);
+    case ir::INST_MASKED_LOAD_ASYNC: return update_graph_elementwise(i, false);
+    case ir::INST_COPY_FROM_SHARED:  return update_graph_no_edge(i);
+    case ir::INST_CVT_LAYOUT:        return update_graph_no_edge(i);
+    default:                         return update_graph_elementwise(i); // any id not listed above
  }
  return;
 }
@@ -135,11 +135,15 @@ std::vector<int> axes::get(ir::value *value) {
 void axes::run(ir::module &mod) {
  // make graph
  graph_.clear();
+  axes_.clear(); // emptied here; &axes_ is handed to connected_components() below
  ir::for_each_instruction(mod, [this](ir::instruction *x) {
    update_graph(x);
  });
  // find connected components
  graph_.connected_components(nullptr, &axes_);
+  std::set<size_t> uniq; // NOTE(review): filled below but never read in the visible code - leftover debugging? confirm and drop
+  for(auto x: axes_)
+    uniq.insert(x.second);
 }

 }
--- a/lib/codegen/analysis/layout.cc
+++ b/lib/codegen/analysis/layout.cc
@@ -109,9 +109,6 @@ data_layout::data_layout(id_t id,
        max_contiguous = curr;
    }
  }
-  bool is_recoalesce = false;
-  for(ir::value* v: values)
-    is_recoalesce = is_recoalesce || dynamic_cast<ir::recoalesce_inst*>(v);
  if(max_contiguous.size() > 0){
    std::sort(order_.begin(), order_.end(), [&](unsigned a, unsigned b) {
      return max_contiguous[a] > max_contiguous[b];
@@ -129,6 +126,13 @@ int data_layout::find_axis(int to_find) const {
 }


+distributed_layout::distributed_layout(id_t id,
+                         const std::vector<int> &axes,
+                         const std::vector<unsigned> &shape,
+                         const std::vector<ir::value *> &values,
+                         analysis::align* align): data_layout(id, axes, shape, values, align)
+{ } // empty body: all five arguments are forwarded unchanged to the data_layout constructor
+
 /* -------------------------------- *
 *           MMA Layout             *
 * -------------------------------- */
@@ -138,20 +142,11 @@ mma_layout::mma_layout(size_t num_warps,
                       const std::vector<unsigned>& shape,
                       const std::vector<ir::value *> &values,
                       analysis::align* align, target* tgt,
-                       shared_layout *layout_a, shared_layout *layout_b): data_layout(MMA, axes, shape, values, align) {
+                       shared_layout *layout_a, shared_layout *layout_b): distributed_layout(MMA, axes, shape, values, align) {
  /* fragments per warp */
  // try to make things as square as possible to maximize data re-use
  if(tgt->as_nvidia()->sm() < 80){
    fpw_ = {2, 2, 1};
-//    std::vector<int> fpw_nm1;
-//    unsigned num_fragments = std::min<unsigned>((shape_[0]/8)*(shape_[1]/8), 4);
-//    do {
-//      fpw_nm1 = fpw_;
-//      if(fpw_[0]*fpw_[1] < num_fragments)
-//        fpw_[0] = clamp(fpw_[0]*2, 1, shape_[0] / 8);
-//      if(fpw_[0]*fpw_[1] < num_fragments)
-//        fpw_[1] = clamp(fpw_[1]*2, 1, shape_[1] / 8);
-//    }while(fpw_nm1 != fpw_);
    auto ord_a = layout_a->get_order();
    auto ord_b = layout_b->get_order();
    bool is_a_row = ord_a[0] != 0;
@@ -168,6 +163,7 @@ mma_layout::mma_layout(size_t num_warps,
    spw_ = {16, 8, 1};
    rep_ = {2,  2, 1};
  }
+  order_ = {0, 1};

  /* warps per tile */
  // try to make things as square as possible to maximize data re-use
@@ -182,7 +178,7 @@ mma_layout::mma_layout(size_t num_warps,
  }while(wpt_nm1 != wpt_);

  /* shape per block */
-  spt_ = {spw_[0]*wpt_[0], spw_[1]*wpt_[1], 1};
+  shape_per_cta_ = {spw_[0]*wpt_[0], spw_[1]*wpt_[1], 1};
 }


@@ -194,7 +190,7 @@ scanline_layout::scanline_layout(size_t num_warps,
                                 const std::vector<int>& axes,
                                 const std::vector<unsigned>& shape,
                                 const std::vector<ir::value *> &values,
-                                 analysis::align* align, target *tgt): data_layout(SCANLINE, axes, shape, values, align){
+                                 analysis::align* align, target *tgt): distributed_layout(SCANLINE, axes, shape, values, align){
  unsigned size = std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies<int>());
  unsigned num_threads = tgt->is_gpu() ? num_warps * 32 : 1;
  nts_.resize(shape_.size());
@@ -230,6 +226,10 @@ scanline_layout::scanline_layout(size_t num_warps,
    mts_[i] = clamp(num_threads, 1, shape_[i] / nts_[i]);
    num_threads = num_threads / mts_[i];
  }
+
+  shape_per_cta_.resize(shape_.size());
+  for(size_t d = 0; d < shape_.size(); d++)
+    shape_per_cta_[d] = mts_[d]*nts_[d];
 }


@@ -489,6 +489,9 @@ void layouts::create(size_t id, const std::vector<ir::value*>& values) {
 void layouts::run(ir::module &mod) {
  // make graph
  graph_.clear();
+  layouts_.clear();
+  groups_.clear();
+
  ir::for_each_instruction(mod, [this](ir::instruction* i) {
    make_graph(i);
  });
@@ -515,23 +518,18 @@ void layouts::run(ir::module &mod) {
      layouts_[id] = new shared_layout(layout, axes_->get(arg), shapes, {red}, red->get_type()->get_scalar_ty(), align_);
      tmp_[red] = id;
    }
-    if(auto *recoalasce = dynamic_cast<ir::recoalesce_inst*>(i)){
-      ir::value *val = recoalasce->get_operand(0);
-      mma_layout* in_layout = get(val)->to_mma();
-      scanline_layout* out_layout = get(i)->to_scanline();
-      if(!in_layout || !out_layout)
-        return;
+    if(auto *val = dynamic_cast<ir::cvt_layout_inst*>(i)){
+      distributed_layout* out_layout = dynamic_cast<distributed_layout*>(get(val));
+      distributed_layout* in_layout = dynamic_cast<distributed_layout*>(get(i->get_operand(0))); // NOTE(review): either cast yields nullptr if get() returns a non-distributed layout; the replaced code returned early on null layouts, this one dereferences both unchecked below - confirm that cannot happen
       id++;
-      ir::type::block_shapes_t in_shape = val->get_type()->get_block_shapes();
-      ir::type::block_shapes_t shape(in_shape.size());
-      size_t ld = out_layout->get_order(0);
-      shape[ld] = in_shape[ld];
-      for(size_t k = 0; k < in_shape.size(); k++)
-        if(k != ld)
-          shape[k] = in_layout->to_mma()->spt(k);
-      // create layout
-      layouts_[id] = new shared_layout(out_layout, axes_->get(val), shape, {recoalasce}, val->get_type()->get_scalar_ty(), align_);
-      tmp_[recoalasce] = id;
+      size_t dim = val->get_type()->get_tile_rank();
+      ir::type::block_shapes_t shape(dim);
+      for(size_t k = 0; k < dim; k++){
+        shape[k] = std::max(in_layout->shape_per_cta(k), // per dimension: the larger of the input and output shape_per_cta
+                            out_layout->shape_per_cta(k));
+      }
+      layouts_[id] = new shared_layout(out_layout, axes_->get(val), shape, {val}, val->get_type()->get_scalar_ty(), align_); // new shared_layout stored under the freshly incremented id
+      tmp_[val] = id; // remember which layout id was created for this instruction
     }
    if(auto *atom = dynamic_cast<ir::atomic_inst*>(i)){
      id++;