[GENERAL] Cleaned polymorphic structure of layouts analysis pass

2020-01-20 15:15:32 -05:00
parent 382ca2c745
commit 78b98fb7cf
17 changed files with 500 additions and 480 deletions
--- a/lib/codegen/selection/machine_layout.cc
+++ b/lib/codegen/selection/machine_layout.cc
@@ -71,18 +71,18 @@ inline int32_t ceil(int32_t num, int32_t div){



-machine_layout_shared_t::machine_layout_shared_t(Module *mod, Builder *builder, target *tgt, analysis::allocation* alloc,
-                                                 Value *&sh_mem_ptr, analysis::layout_shared_t *layout,
+machine_shared_layout::machine_shared_layout(Module *mod, Builder *builder, target *tgt, analysis::allocation* alloc,
+                                                 Value *&sh_mem_ptr, analysis::shared_layout *layout,
                                                 std::map<ir::value *, Value *>& vmap,
                                                 std::map<ir::value *, tile *>& tmap)
  : mod_(mod), builder_(builder), tgt_(tgt), alloc_(alloc), sh_mem_ptr_(sh_mem_ptr), layout_(layout), vmap_(vmap), tmap_(tmap) {

-  Type* ty = llvm_type(layout_->ty, builder_->getContext());
+  Type* ty = llvm_type(layout_->get_type(), builder_->getContext());
  PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr_->getType()->getPointerAddressSpace());
  // double-buffered
-  if(layout_->double_buffer) {
+  if(layout_->get_double_buffer()) {
    BasicBlock *current = builder_->GetInsertBlock();
-    auto info = *layout_->double_buffer;
+    auto info = *layout_->get_double_buffer();
    ir::phi_node *phi = info.phi;
    BasicBlock *parent = (BasicBlock*)vmap_.at((ir::value*)(phi->get_parent()));
    if(parent->empty())
@@ -105,31 +105,31 @@ machine_layout_shared_t::machine_layout_shared_t(Module *mod, Builder *builder,
 }


-tile* machine_layout_shared_t::create(ir::value *v) {
-  auto order = layout_->order;
-  auto shapes = layout_->shapes;
-  Type* ty = llvm_type(layout_->ty, builder_->getContext());
-  // double-buffered
-  if(layout_->double_buffer) {
-    if(v == layout_->double_buffer->phi)
-      return new shared_tile(ty, shapes, order, ptr_, *builder_, offset_);
-    if(v == layout_->double_buffer->latch)
-      return new shared_tile(ty, shapes, order, next_ptr_, *builder_);
-    return new shared_tile(ty, shapes, order, pre_ptr_, *builder_);
-  }
-  else {
-    return new shared_tile(ty, shapes, order, ptr_, *builder_);
-  }
+tile* machine_shared_layout::create(ir::value *v) {
+  Type* ty = llvm_type(layout_->get_type(), builder_->getContext());
+  auto double_buffer = layout_->get_double_buffer();
+  // offset
+  Value *offset = nullptr;
+  if(double_buffer && v == double_buffer->phi)
+    offset = offset_;
+  // base pointer
+  Value *ptr = ptr_;
+  if(double_buffer && v == double_buffer->latch)
+    ptr = next_ptr_;
+  else if(double_buffer && v == double_buffer->first)
+    ptr = pre_ptr_;
+  // create tile
+  return new shared_tile(ty, layout_->get_shape(), layout_->get_order(), ptr, *builder_, offset);
 }

-machine_layout_distributed_t::machine_layout_distributed_t(Module *mod, Builder *builder, target *tgt, Type *ty,
+machine_distributed_layout::machine_distributed_layout(Module *mod, Builder *builder, target *tgt,
                             analysis::axes *a_axes, std::map<unsigned, distributed_axis>& axes,
-                             analysis::layout_t *layout)
-  : mod_(mod), builder_(builder), tgt_(tgt), ty_(ty), a_axes_(a_axes), axes_(axes), layout_(layout) {
+                             analysis::data_layout *layout)
+  : mod_(mod), builder_(builder), tgt_(tgt), a_axes_(a_axes), axes_(axes), layout_(layout) {

 }

-tile *machine_layout_distributed_t::create(ir::value *v) {
+tile *machine_distributed_layout::create(ir::value *v) {
  Type *ty = llvm_type(v->get_type()->get_scalar_ty(), builder_->getContext());
  const auto &shapes = v->get_type()->get_tile_shapes();
  size_t rank = shapes.size();
@@ -151,12 +151,10 @@ tile *machine_layout_distributed_t::create(ir::value *v) {
  auto cmp = [&](int x, int y) {
    unsigned axx = a_axes_->get(v, x);
    unsigned axy = a_axes_->get(v, y);
-    auto itx = std::find(layout_->axes.begin(), layout_->axes.end(), axx);
-    auto ity = std::find(layout_->axes.begin(), layout_->axes.end(), axy);
-    size_t posx = std::distance(layout_->axes.begin(), itx);
-    size_t posy = std::distance(layout_->axes.begin(), ity);
+    size_t posx = layout_->find_axis(axx);
+    size_t posy = layout_->find_axis(axy);
    if(posx < rank && posy < rank)
-      return layout_->order[posx] < layout_->order[posy];
+      return layout_->get_order(posx) < layout_->get_order(posy);
    return false;
  };
  std::sort(order.begin(), order.end(), cmp);
@@ -164,22 +162,21 @@ tile *machine_layout_distributed_t::create(ir::value *v) {
  return new distributed_tile(ty, shapes, order, axes, *builder_);
 }

-machine_layout_hmma_884_t::machine_layout_hmma_884_t(Module *mod, Builder *builder,
-                          target *tgt, Type *ty, analysis::axes *a_axes,
+machine_mma884_layout::machine_mma884_layout(Module *mod, Builder *builder,
+                          target *tgt, analysis::axes *a_axes,
                          std::map<unsigned, distributed_axis>& axes,
-                          analysis::layout_hmma_884_t* layout)
-  : machine_layout_distributed_t(mod, builder, tgt, ty, a_axes, axes, layout) {
+                          analysis::mma884_layout* layout)
+  : machine_distributed_layout(mod, builder, tgt, a_axes, axes, layout) {

  Value *warp_size = builder_->getInt32(32);
  Value* u_thread_id_0 = tgt_->get_local_id(mod_, *builder_, 0);
  Value *u_thread_id = builder_->CreateURem(u_thread_id_0, warp_size);
  Value *u_warp_id = builder_->CreateUDiv(u_thread_id_0, warp_size);

-  const auto& shapes = layout->shapes;
-  if(shapes.size() > 3)
+  const auto& shape = layout->get_shape();
+  if(shape.size() > 3)
    throw std::runtime_error("unsupported");
-
-  bool is_batched = shapes.size() >= 3;
+  bool is_batched = shape.size() >= 3;

  Value *_1 = builder_->getInt32(1);
  Value *_2 = builder_->getInt32(2);
@@ -188,13 +185,13 @@ machine_layout_hmma_884_t::machine_layout_hmma_884_t(Module *mod, Builder *build
  Value *_16 = builder_->getInt32(16);

  // fragments per warp
-  unsigned fpw_0 = layout->fpw.at(0);
-  unsigned fpw_1 = layout->fpw.at(1);
-  unsigned fpw_2 = is_batched ? layout->fpw.at(2) : 1;
+  unsigned fpw_0 = layout->fpw(0);
+  unsigned fpw_1 = layout->fpw(1);
+  unsigned fpw_2 = is_batched ? layout->fpw(2) : 1;
  // warps per tile
-  unsigned wpt_0 = layout->wpt.at(0);
-  unsigned wpt_1 = layout->wpt.at(1);
-  unsigned wpt_2 = is_batched ? layout->wpt.at(2) : 1;
+  unsigned wpt_0 = layout->wpt(0);
+  unsigned wpt_1 = layout->wpt(1);
+  unsigned wpt_2 = is_batched ? layout->wpt(2) : 1;
  // hmma warp tile size
  unsigned hmma_wts_0 = fpw_0 * 8;
  unsigned hmma_wts_1 = fpw_1 * 8;
@@ -204,9 +201,9 @@ machine_layout_hmma_884_t::machine_layout_hmma_884_t(Module *mod, Builder *build
  unsigned hmma_bts_1 = hmma_wts_1 * wpt_1;
  unsigned hmma_bts_2 = is_batched ? hmma_wts_2 * wpt_2 : 1;
  // number of repetition
-  unsigned num_rep_0 = shapes[0] / hmma_bts_0;
-  unsigned num_rep_1 = shapes[1] / hmma_bts_1;
-  unsigned num_rep_2 = is_batched ? shapes[2] / hmma_bts_2 : 1;
+  unsigned num_rep_0 = shape[0] / hmma_bts_0;
+  unsigned num_rep_1 = shape[1] / hmma_bts_1;
+  unsigned num_rep_2 = is_batched ? shape[2] / hmma_bts_2 : 1;
  // size of each pack (interleaving)
  pack_size_0_ = std::min<unsigned>(num_rep_0, 1);
  pack_size_1_ = std::min<unsigned>(num_rep_1, 1);
@@ -275,44 +272,52 @@ machine_layout_hmma_884_t::machine_layout_hmma_884_t(Module *mod, Builder *build


  /* axes */
-  axes_[layout->axes[0]] = distributed_axis{1, idx_i, warp_id_0};
-  axes_[layout->axes[1]] = distributed_axis{1, idx_j, warp_id_1};
+  axes_[layout->get_axis(0)] = distributed_axis{1, idx_i, warp_id_0};
+  axes_[layout->get_axis(1)] = distributed_axis{1, idx_j, warp_id_1};
  if(is_batched)
-    axes_[layout->axes[2]] = distributed_axis{1, idx_z, warp_id_2};
+    axes_[layout->get_axis(2)] = distributed_axis{1, idx_z, warp_id_2};
 }


-machine_layout_scanline_t::machine_layout_scanline_t(Module *mod, Builder *builder,
-                                                     target *tgt, Type *ty,
+machine_scanline_layout::machine_scanline_layout(Module *mod, Builder *builder,
+                                                     target *tgt,
                                                     analysis::axes *a_axes, std::map<unsigned, distributed_axis> &axes,
-                                                     analysis::layout_scanline_t* layout)
-  : machine_layout_distributed_t(mod, builder, tgt, ty, a_axes, axes, layout) {
+                                                     analysis::scanline_layout* layout)
+  : machine_distributed_layout(mod, builder, tgt, a_axes, axes, layout) {

  Value *warp_size = builder_->getInt32(32);
  Value* u_thread_id_0 = tgt_->get_local_id(mod_, *builder_, 0);
  Value *u_thread_id = builder_->CreateURem(u_thread_id_0, warp_size);
  Value *u_warp_id = builder_->CreateUDiv(u_thread_id_0, warp_size);

-  auto order = layout->order;
-  const auto& shapes = layout->shapes;
-  size_t dim = shapes.size();
-  std::vector<int> nts = layout->nts;
-  std::vector<int> mts = layout->mts;
+  auto order = layout->get_order();
+  const auto& shape = layout->get_shape();
  Value* full_thread_id = builder_->CreateAdd(builder_->CreateMul(u_warp_id, builder_->getInt32(32)), u_thread_id);
-  std::vector<Value*> thread_id = delinearize(full_thread_id, order, mts, *builder_);
+  // Delinearize
+  size_t dim = shape.size();
+  std::vector<Value*> thread_id(dim);
+  for(unsigned k = 0; k < dim - 1; k++){
+    Constant *dim_k = builder_->getInt32(layout->mts(order[k]));
+    Value *rem = builder_->CreateURem(full_thread_id, dim_k);
+    full_thread_id = builder_->CreateUDiv(full_thread_id, dim_k);
+    thread_id[order[k]] = rem;
+  }
+  thread_id[order[dim - 1]] = full_thread_id;
  // Create axes
  for(unsigned k = 0; k < dim; k++) {
+    int nts = layout->nts(k);
+    int mts = layout->mts(k);
    std::string str_k = std::to_string(k);
-    Value *contiguous_k = builder_->getInt32(nts[k]);
+    Value *contiguous_k = builder_->getInt32(nts);
    Value *scaled_thread_id = builder_->CreateMul(thread_id[k], contiguous_k);
-    unsigned per_block  = nts[k] * mts[k];
-    unsigned per_thread = nts[k] * shapes[k] / per_block;
+    unsigned per_block  = nts * mts;
+    unsigned per_thread = nts * shape[k] / per_block;
    std::vector<Value*> idx_list(per_thread);
    for(unsigned n = 0 ; n < per_thread; n++){
-      unsigned offset = n / nts[k] * per_block + n % nts[k];
+      unsigned offset = n / nts * per_block + n % nts;
      idx_list[n] = builder_->CreateAdd(scaled_thread_id, builder_->getInt32(offset), "idx_" + str_k + "_" + std::to_string(n));
    }
-    axes_[layout->axes[k]] = distributed_axis{nts[k], idx_list, thread_id[k]};
+    axes_[layout->get_axis(k)] = distributed_axis{nts, idx_list, thread_id[k]};
  }
 }