more work on padding

2019-09-27 22:15:30 -04:00
parent 575dd06be3
commit ed1b2bc563
19 changed files with 191 additions and 191 deletions
--- a/lib/codegen/analysis/align.cc
+++ b/lib/codegen/analysis/align.cc
@@ -487,9 +487,6 @@ void align::populate(ir::value *v) {
  populate_is_constant(v);
  populate_starting_multiple(v);
  populate_max_contiguous(v);
-//  std::cout << v->get_name() << std::endl;
-//  if(max_contiguous_[v].size() == 2)
-//    std::cout << max_contiguous_[v][0] << " " << max_contiguous_[v][1] << std::endl;
 }

 void align::run(ir::module &mod) {
--- a/lib/codegen/analysis/allocation.cc
+++ b/lib/codegen/analysis/allocation.cc
@@ -21,22 +21,22 @@ void allocation::run(ir::module &mod) {
  using std::min;
  typedef std::multimap<unsigned, segment> triples_map_type;

-  std::vector<buffer_t> I;
+  std::vector<buffer_t*> I;
  for(auto x: liveness_->intervals())
    I.push_back(x.first);
-  std::vector<buffer_t> J = I;
+  std::vector<buffer_t*> J = I;

  triples_map_type H;
  H.insert({0, segment{0, INT_MAX}});

-  std::vector<buffer_t> V;
-  std::map<buffer_t, unsigned> starts;
+  std::vector<buffer_t*> V;
+  std::map<buffer_t*, unsigned> starts;
  while(!J.empty()){
    auto h_it = H.begin();
    unsigned w = h_it->first;
    segment xh = h_it->second;
    H.erase(h_it);
-    auto j_it = std::find_if(J.begin(), J.end(), [&](buffer_t JJ){
+    auto j_it = std::find_if(J.begin(), J.end(), [&](buffer_t* JJ){
      segment xj = liveness_->get_interval(JJ);
      bool res = xj.intersect(xh);
      for(auto val: H)
@@ -44,7 +44,7 @@ void allocation::run(ir::module &mod) {
      return res;
    });
    if(j_it != J.end()){
-      unsigned size = j_it->size;
+      unsigned size = (*j_it)->size;
      segment xj = liveness_->get_interval(*j_it);
      starts[*j_it] = w;
      H.insert({w + size, segment{max(xh.start, xj.start), min(xh.end, xj.end)}});
@@ -58,14 +58,14 @@ void allocation::run(ir::module &mod) {
  }

  // Build interference graph
-  std::map<buffer_t, std::set<buffer_t>> interferences;
-  for(buffer_t x: V)
-  for(buffer_t y: V){
-    if(x.id == y.id)
+  std::map<buffer_t*, std::set<buffer_t*>> interferences;
+  for(buffer_t* x: V)
+  for(buffer_t* y: V){
+    if(x->id == y->id)
      continue;
    unsigned X0 = starts[x], Y0 = starts[y];
-    unsigned NX = x.size;
-    unsigned NY = y.size;
+    unsigned NX = x->size;
+    unsigned NY = y->size;
    segment XS = {X0, X0 + NX};
    segment YS = {Y0, Y0 + NY};
    if(liveness_->get_interval(x).intersect(liveness_->get_interval(y))
@@ -74,17 +74,17 @@ void allocation::run(ir::module &mod) {
  }

  // Initialize colors
-  std::map<buffer_t, int> colors;
-  for(buffer_t X: V)
-    colors[X] = (X.id==V[0].id)?0:-1;
+  std::map<buffer_t*, int> colors;
+  for(buffer_t* X: V)
+    colors[X] = (X->id==V[0]->id)?0:-1;


  // First-fit graph coloring
  std::vector<bool> available(V.size());
-  for(buffer_t x: V){
+  for(buffer_t* x: V){
    // Non-neighboring colors are available
    std::fill(available.begin(), available.end(), true);
-    for(buffer_t Y: interferences[x]){
+    for(buffer_t* Y: interferences[x]){
      int color = colors[Y];
      if(color >= 0)
        available[color] = false;
@@ -95,25 +95,24 @@ void allocation::run(ir::module &mod) {
  }

  // Finalize allocation
-  for(buffer_t x: V){
+  for(buffer_t* x: V){
    unsigned Adj = 0;
-    for(buffer_t y: interferences[x])
-      Adj = std::max<unsigned>(Adj, starts[y] + y.size);
+    for(buffer_t* y: interferences[x])
+      Adj = std::max<unsigned>(Adj, starts[y] + y->size);
    // create offsets
    for(ir::value *v: liveness_->get_values(x)){
      offsets_[v] = starts[x] + colors[x] * Adj;
      if(liveness_->has_double(v)){
        auto info = liveness_->get_double(v);
-        offsets_[info.latch] = offsets_[v] + x.size / 2;
+        offsets_[info.latch] = offsets_[v] + x->size / 2;
      }
    }
  }

  // Save maximum size of induced memory space
  allocated_size_ = 0;
-  for(auto &x: offsets_){
-    allocated_size_ = std::max<size_t>(allocated_size_, x.second + liveness_->get_buffer(x.first).size);
-  }
+  for(buffer_t* x: V)
+    allocated_size_ = std::max<size_t>(allocated_size_, starts[x] + x->size);
 }

 }
--- a/lib/codegen/analysis/axes.cc
+++ b/lib/codegen/analysis/axes.cc
@@ -74,7 +74,7 @@ void axes::update_graph_trans(ir::instruction *i) {
  auto perm = trans->get_perm();
  // add edge between axis perm[d] and axis d
  for(unsigned d = 0; d < perm.size(); d++)
-    add_constraint({i, perm[d]->get_value()}, {op, d});
+    add_constraint({i, perm[d]}, {op, d});
 }

 void axes::update_graph_broadcast(ir::instruction *i) {
--- a/lib/codegen/analysis/liveness.cc
+++ b/lib/codegen/analysis/liveness.cc
@@ -58,6 +58,18 @@ void liveness::make_graph(ir::instruction *i) {
    graph_[i].insert(latch);
    graph_[latch].insert(i);
  }
+  if(i->get_id() == ir::INST_PHI){
+    ir::phi_node* phi = (ir::phi_node*)i;
+    for(ir::value* op: phi->ops()){
+      auto* iop = dynamic_cast<ir::instruction*>(op);
+      if(!iop || storage_info.at(iop->get_id()).first != SHARED)
+        continue;
+      nodes_.insert(phi);
+      nodes_.insert(op);
+      graph_[phi].insert(op);
+      graph_[op].insert(phi);
+    }
+  }
  if(i->get_id() == ir::INST_TRANS){
    nodes_.insert(i);
    nodes_.insert(i->get_operand(0));
@@ -67,39 +79,63 @@ void liveness::make_graph(ir::instruction *i) {
 }

 // connected components
-void liveness::connected_components(node_t x, std::set<node_t> &nodes, graph_t &graph, unsigned group_id) {
-  buffer_t buffer{group_id, num_bytes(x)};
+void liveness::connected_components(node_t x, std::set<node_t> &nodes, graph_t &graph, buffer_t* buffer) { // records x under buffer, then visits x's neighbors if x was still in nodes
  groups_[x] = buffer;
  values_[buffer].push_back(x);
  if(nodes.find(x) != nodes.end()){
    nodes.erase(x);
    for(const node_t &y: graph[x])
-      connected_components(y, nodes, graph, group_id);
+      connected_components(y, nodes, graph, buffer); // every neighbor is recorded under the same buffer
  }
 }

-unsigned liveness::is_ld_padded(ir::value *x) {
-  if(auto *trans = dynamic_cast<ir::trans_inst*>(x)){
-    if(trans->get_perm()[0]->get_value() != 0)
-      return 4;
+bool liveness::do_pad(ir::value *x) { // updates pad_ for x or its operands; returns true if an entry changed
+  // padding for matrix product: recompute pad_ for both operands of the dot
+  if(auto* dot = dynamic_cast<ir::dot_inst*>(x)) {
+    auto order = tiles_->order(x);
+    // a: 16 / 8 for HMMA_A_ROW / HMMA_A_COL, else 4 when exactly one of a_trans, a_row holds, else 0
+    ir::value *a = dot->get_operand(0);
+    size_t previous_a = pad_[a];
+    bool a_trans = dynamic_cast<ir::trans_inst*>(a);
+    bool a_row = order[0] == 1;
+    if(tiles_->hmma(x) == HMMA_A_ROW) // NOTE(review): hmma is queried on the dot x, not on operand a -- confirm
+      pad_[a] = 16;
+    else if(tiles_->hmma(x) == HMMA_A_COL)
+      pad_[a] = 8;
+    else if(a_trans ^ a_row)
+      pad_[a] = 4;
+    else
+      pad_[a] = 0;
+    // b: same else-if chain as a -- 16 / 8 for HMMA_B_COL / HMMA_B_ROW, else 4 when exactly one of b_trans, b_col holds, else 0
+    ir::value *b = dot->get_operand(1);
+    size_t previous_b = pad_[b];
+    bool b_trans = dynamic_cast<ir::trans_inst*>(b);
+    bool b_col = order[0] == 0;
+    if(tiles_->hmma(x) == HMMA_B_COL)
+      pad_[b] = 16;
+    else if(tiles_->hmma(x) == HMMA_B_ROW)
+      pad_[b] = 8;
+    else if(b_trans ^ b_col)
+      pad_[b] = 4;
+    else
+      pad_[b] = 0;
+    return previous_a != pad_[a] || previous_b != pad_[b];
  }
-  auto order = tiles_->order(x);
-  bool is_col_major = order[0] == 0;
-  if(tiles_->hmma(x) == HMMA_A_ROW)
-    return is_col_major ? 16 : 16;
-  if(tiles_->hmma(x) == HMMA_A_COL)
-    return is_col_major ? 8 : 8;
-  if(tiles_->hmma(x) == HMMA_B_COL)
-    return is_col_major ? 16 : 16;
-  if(tiles_->hmma(x) == HMMA_B_ROW)
-    return is_col_major ? 8 : 8;
+  // padding for phi-nodes: raise each operand's padding to at least the phi's own
  if(auto* phi = dynamic_cast<ir::phi_node*>(x)) {
-    unsigned result = 0;
-    for(unsigned i = 0; i < phi->get_num_incoming(); i++)
-      result = std::max(result, is_ld_padded(phi->get_incoming_value(i)));
-    return result;
+    bool has_changed = false;
+    for(unsigned i = 0; i < phi->get_num_incoming(); i++){
+      ir::value* op = phi->get_operand(i);
+      size_t previous = pad_[op];
+      pad_[op] = std::max(pad_[op], pad_[phi]);
+      has_changed |= previous != pad_[op];
+    }
+    return has_changed;
  }
-  return 0;
+  // default -- no padding
+  size_t previous = pad_[x];
+  pad_[x] = std::max<int>(previous, 0);
+  return pad_[x] != previous;
 }

 unsigned liveness::num_bytes(ir::value *x) {
@@ -120,7 +156,8 @@ unsigned liveness::num_bytes(ir::value *x) {
    return num_elements * num_bytes * depth;
  }
  unsigned num_bytes = x->get_type()->get_primitive_size_in_bits() / 8;
-  unsigned pad = is_ld_padded(x);
+  unsigned pad = pad_.at(x);
+  std::cout << x->get_name() << " " << pad << std::endl;
  if(pad > 0){
    unsigned ld = x->get_type()->get_tile_shapes()[tiles_->order(x)[0]];
    num_bytes += pad * num_bytes / ld;
@@ -134,6 +171,7 @@ unsigned liveness::num_bytes(ir::value *x) {
 void liveness::run(ir::module &mod) {
  double_.clear();
  indices.clear();
+  pad_.clear();
  intervals_.clear();
  parents_.clear();

@@ -142,6 +180,15 @@ void liveness::run(ir::module &mod) {
    this->extract_double_bufferable(i);
  });

+  // Padding information
+  bool has_changed;
+  do{
+    has_changed = false;
+    ir::for_each_value(mod, [this, &has_changed](ir::value* v){
+      has_changed |= this->do_pad(v);
+    });
+  }while(has_changed);
+
  // Create buffer dependency graph
  ir::for_each_instruction(mod, [this](ir::instruction* i) {
    this->make_graph(i);
@@ -150,7 +197,10 @@ void liveness::run(ir::module &mod) {
  // connected components
  unsigned group_id = 0;
  while(!nodes_.empty()){
-    connected_components(*nodes_.begin(), nodes_, graph_, group_id++);
+    buffer_t* buffer = new buffer_t{group_id++};
+    connected_components(*nodes_.begin(), nodes_, graph_, buffer);
+    for(ir::value *v: values_.at(buffer))
+      buffer->size = std::max<int>(buffer->size, num_bytes(v));
  }

  // Assigns index to each instruction
--- a/lib/codegen/analysis/tiles.cc
+++ b/lib/codegen/analysis/tiles.cc
@@ -40,7 +40,7 @@ bool is_hmma_a_col(ir::value* v) {
  for(ir::user *u: v->get_users())
    if(is_hmma_c(u)){
      ir::dot_inst* dot = (ir::dot_inst*)u;
-      if((v == dot->get_operand(0)) && !dot->is_a_trans())
+      if((v == dot->get_operand(0)))
        return true;
    }
 }
@@ -49,7 +49,7 @@ bool is_hmma_a_row(ir::value* v) {
  for(ir::user *u: v->get_users())
    if(is_hmma_c(u)){
      ir::dot_inst* dot = (ir::dot_inst*)u;
-      if((v == dot->get_operand(0)) && dot->is_a_trans())
+      if((v == dot->get_operand(0)))
        return true;
    }
 }
@@ -58,7 +58,7 @@ bool is_hmma_b_col(ir::value* v) {
  for(ir::user *u: v->get_users())
    if(is_hmma_c(u)){
      ir::dot_inst* dot = (ir::dot_inst*)u;
-      if((v == dot->get_operand(1)) && !dot->is_b_trans())
+      if((v == dot->get_operand(1)))
        return true;
    }
 }
@@ -67,7 +67,7 @@ bool is_hmma_b_row(ir::value* v) {
  for(ir::user *u: v->get_users())
    if(is_hmma_c(u)){
      ir::dot_inst* dot = (ir::dot_inst*)u;
-      if((v == dot->get_operand(1)) && dot->is_b_trans())
+      if((v == dot->get_operand(1)))
        return true;
    }
 }
@@ -170,6 +170,7 @@ void tiles::init_scanline_tile(ir::value *i) {
  unsigned effective_num_threads = 1;
  for(size_t d = 0; d < shapes.size(); d++)
    effective_num_threads *= mts_[axes_->get_id(i, d)];
+//  std::cout << num_threads << " " << effective_num_threads << std::endl;
  if(num_threads != effective_num_threads)
    throw std::runtime_error("cannot create a kernel with this amount of warps");
 }
@@ -219,7 +220,7 @@ void tiles::run(ir::module &) {
    largest_[i] = *std::max_element(values.begin(), values.end(), cmp);
  }

-  // find out the order of a group
+  // find out the layout ordering of a group
  for(size_t i = 0; i < num_groups; i++){
    std::set<ir::io_inst*> io;
    for(ir::value* v: layout_->values(i))
@@ -239,11 +240,6 @@ void tiles::run(ir::module &) {
    order_[i] = order;
  }
  for(size_t i = 0; i < num_groups; i++){
-    bool is_hmma_op = hmma_[i] == HMMA_A_COL || hmma_[i] == HMMA_A_ROW ||
-                      hmma_[i] == HMMA_B_COL || hmma_[i] == HMMA_B_ROW;
-    if(!is_hmma_op)
-      continue;
-    // extract copies to shared memory
    std::vector<ir::copy_to_shared_inst*> cts;
    for(ir::value* v: layout_->values(i))
      if(auto *x = dynamic_cast<ir::copy_to_shared_inst*>(v))
--- a/lib/codegen/selection.cc
+++ b/lib/codegen/selection.cc
@@ -146,26 +146,30 @@ void shared_tile::extract_constant(const indices_t &arg_idx, indices_t &non_cst_
 }


-Value* shared_tile::shared_offset(llvm::IRBuilder<> &builder, const shapes_t& shapes, const std::vector<int>& order, indices_t idx) {
+Value* shared_tile::shared_offset(llvm::IRBuilder<> &builder, const shapes_t& shapes, const std::vector<int>& perm, const std::vector<int>& order, indices_t idx) {
+  // strides: axis order[0] gets stride 1; axis order[i] gets stride(order[i-1]) * shapes[order[i-1]]
+  std::vector<Value*> strides(order.size());
+  strides[order[0]] = builder.getInt32(1);
+  for(size_t i = 1; i < idx.size(); i++) // NOTE(review): bounded by idx.size() while strides has order.size() entries -- assumes both are equal, confirm
+    strides[order[i]] = builder.CreateMul(strides[order[i-1]], builder.getInt32(shapes[order[i-1]]));
+  // result: sum over axes i of idx[perm[i]] * strides[i]
  Value *result = builder.getInt32(0);
-  result = builder.CreateAdd(result, idx[order[0]]);
-  Value *ld = builder.getInt32(shapes[order[0]]);
-  for(size_t i = 1; i < idx.size(); i++) {
-    result = builder.CreateAdd(result, builder.CreateMul(idx[order[i]], ld));
-    if(i < idx.size() - 1){
-      ld = builder.CreateMul(ld, builder.getInt32(shapes[order[i]]));
-    }
-  }
+  for(size_t i = 0; i < strides.size(); i++)
+    result = builder.CreateAdd(result, builder.CreateMul(idx[perm[i]], strides[i]));
  return result;
 }

-shared_tile::shared_tile(Type *ty, const shapes_t &shapes, const std::vector<int>& order, Value *ptr, llvm::IRBuilder<> &builder, Value *offset):
-  tile(ty, shapes), order_(order), ptr_(ptr), builder_(builder), offset_(offset), vector_size_(1){
+shared_tile::shared_tile(Type *ty, const shapes_t &shapes, const std::vector<int>& order, Value *ptr, llvm::IRBuilder<> &builder, Value *offset, const std::vector<int>& perm):
+  tile(ty, shapes), order_(order), ptr_(ptr), builder_(builder), offset_(offset), vector_size_(1), perm_(perm){
  return_vector_ = false;
+  if(perm_.empty()){ // an empty perm means no permutation was supplied: fall back to the identity
+    perm_.resize(shapes.size());
+    std::iota(perm_.begin(), perm_.end(), 0); // fills 0, 1, ..., shapes.size()-1; NOTE(review): needs <numeric> -- confirm it is included
+  }
 }

 void shared_tile::set_value(indices_t idx, Value *value) {
-  Value *ptr = builder_.CreateGEP(ptr_, shared_offset(builder_, shapes_, order_, idx));
+  Value *ptr = builder_.CreateGEP(ptr_, shared_offset(builder_, shapes_, perm_, order_, idx));
  unsigned addr_space = ptr->getType()->getPointerAddressSpace();
  ptr = builder_.CreateBitCast(ptr, value->getType()->getPointerTo(addr_space));
  builder_.CreateStore(value, ptr);
@@ -196,7 +200,7 @@ Value* shared_tile::get_value(indices_t idx) {
 //    if(isa<Instruction>(non_cst_idx.front())){
 //      builder_.SetInsertPoint((Instruction*)non_cst_idx.front());
 //    }
-    base_ptr = builder_.CreateGEP(ptr_, shared_offset(builder_, shapes_, order_, non_cst_idx));
+    base_ptr = builder_.CreateGEP(ptr_, shared_offset(builder_, shapes_, perm_, order_, non_cst_idx));
    if(vector_size_ > 1){
      Type *vec_ty = VectorType::get(ty, vector_size);
      Type *vec_ptr_ty = PointerType::get(vec_ty, base_ptr->getType()->getPointerAddressSpace());
@@ -204,7 +208,7 @@ Value* shared_tile::get_value(indices_t idx) {
    }
 //    builder_.SetInsertPoint(store);
  }
-  Value *offset = shared_offset(builder_, shapes_, order_, cst_idx);
+  Value *offset = shared_offset(builder_, shapes_, perm_, order_, cst_idx);
  Value *div = offset;
  if(vector_size_ > 1)
    div = builder_.CreateUDiv(offset, builder_.getInt32(vector_size_));
@@ -725,7 +729,7 @@ void selection::create_shared_tile(ir::value *v, IRBuilder<> &builder, Value *sh
    return;
  auto order = tiles_->order(v);
  auto shapes = v->get_type()->get_tile_shapes();
-  unsigned pad = liveness_->is_ld_padded(v);
+  unsigned pad = liveness_->get_pad(v);
  if(pad > 0)
    shapes[order[0]] += pad;
  Type* ty = llvm_type(v->get_type()->get_scalar_ty(), builder.getContext());
@@ -923,7 +927,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn,
    write_idx.insert(write_idx.begin() + axis, lane);

    // shared memory write  pointer
-    Value *write_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), op_tile->get_order(), write_idx);
+    Value *write_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), {0, 1}, op_tile->get_order(), write_idx);
    Value *write_ptr = builder.CreateGEP(base_ptr, write_offset);

    // initialize shared memory
@@ -936,7 +940,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn,
      indices_t current(write_idx.size(), builder.getInt32(0));
      current[axis] = builder.getInt32(i);
      // shared memory offset
-      Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), op_tile->get_order(), current);
+      Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), {0, 1}, op_tile->get_order(), current);
      Value *is_active = builder.CreateICmpULT(lane, builder.getInt32(i));
      read_offset = builder.CreateSelect(is_active, read_offset, builder.getInt32(0));
      // shared memory read pointer
@@ -952,7 +956,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn,
    // result is on the first lane of shared memory
    indices_t final = write_idx;
    final[axis] = builder.getInt32(0);
-    Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), op_tile->get_order(), final);
+    Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), {0, 1}, op_tile->get_order(), final);
    Value *read_ptr = builder.CreateGEP(base_ptr, read_offset);
    tgt_->add_barrier(module, builder);
    result = builder.CreateLoad(read_ptr);
@@ -1041,11 +1045,7 @@ void selection::lower_copy_to_shared(ir::copy_to_shared_inst *x, LLVMContext &ct

 void selection::lower_trans(ir::trans_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) {
  shared_tile* in = (shared_tile*)tmap_.at(x->get_operand(0));
-  auto in_order = in->get_order();
-  std::vector<int> order;
-  for(auto p: x->get_perm())
-    order.push_back(in_order[p->get_value()]);
-  shared_tile* out = new shared_tile(in->get_ty(), in->get_shapes(), order, in->get_pointer(), builder, in->get_offset());
+  shared_tile* out = new shared_tile(in->get_ty(), in->get_shapes(), in->get_order(), in->get_pointer(), builder, in->get_offset(), x->get_perm()); // same type, shapes, order, pointer and offset as the input tile; only the permutation comes from x
  tmap_[x] = out;
 }

@@ -1082,8 +1082,8 @@ void selection::lower_hmma_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn
  auto ord_a = tiles_->order(dot->get_operand(0));
  auto ord_b = tiles_->order(dot->get_operand(1));

-  bool is_a_row = dot->is_a_trans() ^ ord_a[ord_a.size() - 2] == 1;
-  bool is_b_row = dot->is_b_trans() ^ ord_b[ord_b.size() - 2] == 1;
+  bool is_a_row = ord_a[ord_a.size() - 2] == 1;
+  bool is_b_row = ord_b[ord_b.size() - 2] == 1;

  if(is_a_row){
    offset_a_i = builder.CreateAdd(offset_a_i, builder.CreateURem(u_thread_id, builder.getInt32(4)));
@@ -1125,10 +1125,6 @@ void selection::lower_hmma_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn
      Value *current_offset_b_i = builder.CreateAdd(offset_b_j, builder.getInt32(pack_j*stride_rep_j*pack_size_1_));
      indices_t idx_a = {current_offset_a_i, builder.CreateAdd(offset_a_k, _K)};
      indices_t idx_b = {current_offset_b_i, builder.CreateAdd(offset_b_k, _K)};
-      if(dot->is_a_trans())
-        std::swap(idx_a[0], idx_a[1]);
-      if(!dot->is_b_trans())
-        std::swap(idx_b[0], idx_b[1]);
      idx_a.insert(idx_a.end(), x.first.begin(), x.first.end());
      idx_b.insert(idx_b.end(), x.first.begin(), x.first.end());
      Value *ha = TA->get_value(idx_a);
@@ -1188,10 +1184,6 @@ void selection::lower_scanline_dot(ir::dot_inst *dot, LLVMContext &ctx, Function
      // input indices
      indices_t a_idx = {idx[0], builder.getInt32(K)};
      indices_t b_idx = {builder.getInt32(K), idx[1]};
-      if(dot->is_a_trans())
-        std::swap(a_idx[0], a_idx[1]);
-      if(dot->is_b_trans())
-        std::swap(b_idx[0], b_idx[1]);
      // add batching dimension
      for(size_t i = 2; i < idx.size(); i++){
        a_idx.insert(a_idx.end(), idx[i]);
@@ -1217,10 +1209,8 @@ void selection::lower_outer_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *f
    Value *res = TD->get_value(idx);
    indices_t a_idx = {idx[0], builder.getInt32(0)};
    indices_t b_idx = {builder.getInt32(0), idx[1]};
-    if(dot->is_a_trans())
-      std::swap(a_idx[0], a_idx[1]);
-    if(dot->is_b_trans())
-      std::swap(b_idx[0], b_idx[1]);
+    std::swap(a_idx[0], a_idx[1]);
+    std::swap(b_idx[0], b_idx[1]);
    Value *a = TA->get_value(a_idx);
    Value *b = TB->get_value(b_idx);
    if(a->getType() != c_ty)
@@ -1243,7 +1233,7 @@ void selection::lower_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRB
  Type *c_ty = llvm_type(D->get_type()->get_scalar_ty(), ctx);
  Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {c_ty});
  auto A_shapes = A->get_type()->get_tile_shapes();
-  size_t red_axis = dot->is_a_trans() ? 0 : 1;
+  size_t red_axis = 1;
  unsigned NK = A_shapes[red_axis];

  if(NK != 1) {
@@ -1552,8 +1542,8 @@ void selection::run(ir::module &src, Module &dst) {
            offset->addIncoming(next_offset, llvm_inc_block);
          }
          else {
-            unsigned num_bytes = phi->get_type()->get_scalar_ty()->get_primitive_size_in_bits() / 8;
-            offset->addIncoming(dst_builder.getInt32(liveness_->num_bytes(phi)/(num_bytes)), llvm_inc_block);
+            unsigned num_bytes = inst->get_type()->get_scalar_ty()->get_primitive_size_in_bits() / 8;
+            offset->addIncoming(dst_builder.getInt32(liveness_->get_buffer(inst)->size / (2*num_bytes)), llvm_inc_block);
          }
          ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block);
        }
--- a/lib/codegen/transform/membar.cc
+++ b/lib/codegen/transform/membar.cc
@@ -38,8 +38,8 @@ void membar::add_reference(ir::value *v, interval_vec_t &res){
    return;
  if(alloc_->has_offset(v)){
    unsigned offset = alloc_->offset(v);
-    unsigned num_bytes = liveness_->num_bytes(v);
-    res.push_back(interval_t(offset, offset + num_bytes));
+    unsigned size = liveness_->get_buffer(v)->size;
+    res.push_back(interval_t(offset, offset + size));
  }
 }

--- a/lib/codegen/transform/peephole.cc
+++ b/lib/codegen/transform/peephole.cc
@@ -8,37 +8,8 @@ namespace codegen{
 namespace transform{


-inline bool is_trans(ir::value *v){
-  auto *x = dynamic_cast<ir::trans_inst*>(v);
-  if(!x)
-    return false;
-  std::vector<ir::constant_int*> perm = x->get_perm();
-  std::vector<ir::constant_int*> ref;
-  ir::type *int32_ty = ir::type::get_int32_ty(v->get_type()->get_context());
-  for(size_t i = 0; i < perm.size(); i++)
-    ref.push_back(ir::constant_int::get(int32_ty, i));
-  std::swap(ref[0], ref[1]);
-  // true is perm == ref
-  return std::equal(perm.begin(), perm.end(), ref.begin());
-}
-
-inline bool is_hmma(ir::value *v){
-  bool result = false;
-  if(auto *x = dynamic_cast<ir::dot_inst*>(v)){
-    ir::value *a = x->get_operand(0);
-    ir::type *a_ty = a->get_type();
-    ir::value *b = x->get_operand(1);
-    ir::type *b_ty = b->get_type();
-    // inputs have to be FP16
-    result = a_ty->get_scalar_ty()->is_half_ty() && b_ty->get_scalar_ty()->is_half_ty();
-//   reduction has to be multiple of 4
-//    result = result && ((a_ty->get_tile_shapes()[1]->get_value() % 4) == 0);
-  }
-  return result;
-}
-
 ir::value* rewrite_trans_phi_impl(ir::value *value, ir::builder &builder,
-                                 const std::vector<ir::constant_int*>& perm) {
+                                 const std::vector<int>& perm) {
  if(auto phi = dynamic_cast<ir::phi_node*>(value)) {
    // transpose operands
    std::vector<ir::value*> incs;
@@ -106,9 +77,7 @@ bool peephole::rewrite_dot(ir::instruction *value, ir::builder& builder){
    ir::value *a = dot->get_operand(0);
    ir::value *b = dot->get_operand(1);
    builder.set_insert_point(add);
-    ir::value * new_dot = builder.insert(ir::dot_inst::create(a, b, other,
-                                                              dot->is_a_trans(), dot->is_b_trans(),
-                                                              dot->get_name()));
+    ir::value * new_dot = builder.insert(ir::dot_inst::create_nn(a, b, other, dot->get_name()));
    add->replace_all_uses_with(new_dot);
    return true;
  }