[code generation]: more progress for instruction selection

2019-01-26 02:05:56 -05:00
parent e2de27dfe2
commit e522b06be2
8 changed files with 91 additions and 34 deletions
--- a/examples/matrix.cpp
+++ b/examples/matrix.cpp
@@ -32,10 +32,7 @@ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\
  int32 k;\
  fp32* pa[32, 8] = a + rx[:, newaxis] + rka[newaxis, :]*M;\
  fp32* pb[32, 8] = b + ry[:, newaxis] + rkb[newaxis, :]*K;\
-  fp32* pc[32, 32];\
-  for(k = 0; k < K; k = k + 8){\
-  }\
-  pc = c + rx[:, newaxis] + ry[newaxis, :];\
+  fp32* pc[32, 32] = c + rx[:, newaxis] + ry[newaxis, :]*M;\
  *pc = C;\
 }\
 ";
@@ -59,15 +56,13 @@ int main() {
   tune.run(module);
   std::vector<unsigned> params = {
     // asm
-     2, 16, 1,
+     2, 8, 1,
     // bsn
-     2, 16, 1,
+     4, 4, 1,
     // pa
-     1, 2, 4,
+     2, 4, 1,
     // pb
-     1, 2, 4,
-     // c
-     2, 16, 1, 1, 2, 4
+     1, 8, 1,
   };
   std::map<tdl::ir::value*, std::vector<std::string>> errors;
   unsigned i = 0;
@@ -75,11 +70,11 @@ int main() {
   for(unsigned *x: tune.get_params(module))
     *x = params[i++];
   tune.check_constraints(module, errors);
-//   std::cout << "errors: " << errors.size() << std::endl;
-//   for(auto &x: errors){
-//   for(auto &e: x.second)
-//     std::cout << e << std::endl;
-//   }
+   std::cout << "errors: " << errors.size() << std::endl;
+   for(auto &x: errors){
+   for(auto &e: x.second)
+     std::cout << e << std::endl;
+   }
   shared.run(module);
   liveness.run(module);
   allocation.run();
--- a/include/ast/ast.h
+++ b/include/ast/ast.h
@@ -114,9 +114,15 @@ public:
  const slice_enum_t type_;
 };

+class named_expression;
+
 class expression: public node{
 public:
  virtual ir::value* codegen(ir::module *) const = 0;
+  named_expression *lvalue() const { return lvalue_; } // null unless a subclass assigned lvalue_
+
+protected:
+  named_expression *lvalue_ = nullptr; // default keeps lvalue() well-defined for subclasses that never set it
 };

 class postfix_expression: public expression{
@@ -163,10 +169,9 @@ private:
  const list<slice*>* slices_;
 };

-
 class named_expression: public expression {
 public:
-  named_expression(node *id): id_((const identifier*)id) { }
+  named_expression(node *id): id_((const identifier*)id) { lvalue_ = this; }
  const identifier *id() const { return id_; }
  ir::value* codegen(ir::module * mod) const;

@@ -227,8 +232,11 @@ private:

 public:
  unary_operator(UNARY_OP_T op, node *arg)
-    : op_(op),
-      arg_((expression*)arg) { }
+      : op_(op),
+        arg_((expression*)arg) {
+    // only a dereference forwards its operand's lvalue; every other operator gets an explicit null
+    this->lvalue_ = (op == DEREF) ? arg_->lvalue() : nullptr;
+  }

  UNARY_OP_T get_op() const { return op_; }
  ir::value* codegen(ir::module *mod) const;
--- a/include/codegen/selection.h
+++ b/include/codegen/selection.h
@@ -32,9 +32,10 @@ protected:
  typedef std::vector<unsigned> shapes_t;

 public:
-  tile(const shapes_t &shapes): shapes_(shapes){ }
+  tile(llvm::Type *ty, const shapes_t &shapes): ty_(ty), shapes_(shapes){ } // stores both the LLVM type and the shape

 private:
+  llvm::Type *ty_;
  shapes_t shapes_;
 };

@@ -46,13 +47,20 @@ public:

 class distributed_tile: public tile{
  typedef std::vector<distributed_axis> axes_t;
+  typedef std::vector<llvm::Value*> indices_t; // one index value per axis
+  typedef std::map<indices_t, unsigned> indices_map_t; // index tuple -> number assigned by init_indices()
+  typedef std::vector<llvm::Value*> values_t; // same underlying type as indices_t
+
+private:
+  void init_indices(); // fills indices_ from axes_; called by the constructor

 public:
-  distributed_tile(const shapes_t& shapes, const axes_t &axes)
-    : tile(shapes), axes_(axes) {}
+  distributed_tile(llvm::Type *ty, const shapes_t& shapes, const axes_t &axes); // defined out of line

 private:
  axes_t axes_;
+  indices_map_t indices_; // populated by init_indices()
+  values_t values_; // the constructor appends one entry per element of indices_
 };


--- a/include/ir/instructions.h
+++ b/include/ir/instructions.h
@@ -26,6 +26,8 @@ public:
  const basic_block *get_parent() const { return parent_;  }
  basic_block *get_parent()             { return parent_;  }
  void erase_from_parent();
+  // helpers
+  bool has_tile_result_or_op(); // true if the result type or any operand type is a tile type

 private:
  basic_block *parent_;
--- a/lib/ast/lowering.cpp
+++ b/lib/ast/lowering.cpp
@@ -523,7 +523,8 @@ ir::value *assignment_expression::codegen(ir::module *mod) const{
    mod->set_value(x->id()->name(), rvalue);
  else if(auto* x = dynamic_cast<const unary_operator*>(lvalue_)){
    assert(x->get_op()==DEREF);
-    ir::value *ptr = x->codegen(mod);
+    assert(x->lvalue());
+    ir::value *ptr = x->lvalue()->codegen(mod);
    mod->get_builder().create_store(ptr, rvalue);
  }
  return rvalue;
--- a/lib/codegen/selection.cpp
+++ b/lib/codegen/selection.cpp
@@ -13,6 +13,33 @@ namespace codegen{

 using namespace llvm;

+/* Distributed Tile */
+void distributed_tile::init_indices() {
+  // Number every combination of per-axis values, axis 0 varying fastest.
+  for(const auto &axis: axes_)
+    if(axis.values.empty())
+      return; // no combination exists, and values[0] below would be out of range
+  std::vector<size_t> id(axes_.size(), 0);
+  size_t k = 0;
+  do {
+    indices_t current;
+    for(size_t d = 0; d < id.size(); d++)
+      current.push_back(axes_[d].values[id[d]]);
+    const unsigned linear = indices_.size(); // read before operator[] can insert
+    indices_[current] = linear;
+    // odometer step: bump axis 0, resetting and carrying on every axis that wraps
+    for(k = 0; k < id.size() && ++id[k] == axes_[k].values.size(); k++)
+      id[k] = 0;
+  } while(k < id.size()); // k == id.size() means every axis wrapped
+}
+
+distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const axes_t &axes)
+    : tile(ty, shapes), axes_(axes) {
+  init_indices(); // must run first: the loop below is sized by indices_
+  for(size_t i = 0; i < indices_.size(); i++)
+    values_.push_back(UndefValue::get(ty)); // use the argument: tile::ty_ is private to the base class
+}
+

 /* convert ir::type to Type */
 Type *selection::llvm_type(ir::type *ty, LLVMContext &ctx) {
@@ -186,7 +213,7 @@ void selection::init_axes(ir::instruction *instr, IRBuilder<> &builder, Value *u
      unsigned offset = n / contiguous[k] * per_block + n % contiguous[k];
      idx_list[n] = builder.CreateAdd(thread_id, builder.getInt32(offset));
    }
-    axes[k] = {idx_list};
+    axes[k] = distributed_axis{idx_list};
  }
  // Store axes
  axes_[instr] = axes;
@@ -230,6 +257,7 @@ void selection::create_grids(std::vector<ir::instruction*> &grids,
 void selection::init_grids(ir::function *fn, IRBuilder<> &builder){
  // fetch linear ID
  Module *mod = builder.GetInsertBlock()->getParent()->getParent();
+  LLVMContext &ctx = builder.getContext();
  Function *get_thread_id = Intrinsic::getDeclaration(mod, Intrinsic::nvvm_read_ptx_sreg_tid_x);
  Value *warp_size = builder.getInt32(32);
  Value *u_thread_id = builder.CreateCall(get_thread_id, {});
@@ -248,9 +276,10 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder){
      continue;
    bool is_shared = dynamic_cast<ir::copy_to_shared_inst*>(i);
    const auto& shapes = i->get_type()->get_tile_shapes();
+    Type* ty = llvm_type(i->get_type(), ctx);
    // create shared tile
    if(is_shared){
-      tmap_.insert({i, new shared_tile(shapes)});
+      tmap_.insert({i, new shared_tile(ty, shapes)});
    }
    // create distributed tile
    else {
@@ -264,20 +293,18 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder){
        else
          axes[d].values = {builder.getInt32(0)};
      }
-      tmap_.insert({i, new distributed_tile(shapes, axes)});
+      tmap_.insert({i, new distributed_tile(ty, shapes, axes)});
    }
  }
 }

 void selection::lower_tile_instruction(ir::instruction *src, llvm::IRBuilder<> &builder) {
-
+  std::cout << typeid(*src).name() << std::endl; // only prints the dynamic type name of src; builder is unused
 }

 void selection::lower_instruction(ir::instruction *src, IRBuilder<> &builder) {
  LLVMContext &ctx = builder.getContext();
-  std::cout << typeid(*src).name() << " " << src->get_type()->get_type_id() << std::endl;
-  if(src->get_type()->is_tile_ty()) {
-    std::cout << "tile instruction" << std::endl;
+  if(src->has_tile_result_or_op()) {
    lower_tile_instruction(src, builder);
  }
  else {
--- a/lib/codegen/tune.cpp
+++ b/lib/codegen/tune.cpp
@@ -29,7 +29,13 @@ void tune::init_c_phi(ir::instruction *v) {
 }

 void tune::init_c_graph(ir::instruction *v) {
-  const auto& shapes = v->get_type()->get_tile_shapes();
+  // Reference shape
+  std::vector<unsigned> shapes;
+  if(auto *store = dynamic_cast<ir::store_inst*>(v))
+    shapes = store->get_pointer_operand()->get_type()->get_tile_shapes();
+  else
+    shapes = v->get_type()->get_tile_shapes();
+  // Reshape
  if(dynamic_cast<ir::reshape_inst*>(v)){
    ir::value *op = v->get_operand(0);
    unsigned current = 0;
@@ -40,9 +46,11 @@ void tune::init_c_graph(ir::instruction *v) {
        add_constraint({v, i}, {op, current++});
    }
  }
+  // Splat
  else if(dynamic_cast<ir::splat_inst*>(v)){

  }
+  // Broadcast
  else if(dynamic_cast<ir::broadcast_inst*>(v)){
    ir::value *op = v->get_operand(0);
    ir::type *op_ty = op->get_type();
@@ -51,13 +59,14 @@ void tune::init_c_graph(ir::instruction *v) {
      if(op_shapes[i] == shapes[i] && v != op)
        add_constraint({v, i}, {op, i});
    }
-
  }
+  // Matrix multiplication
  else if(dynamic_cast<ir::matmul_inst*>(v)){
    ir::value *D = v->get_operand(2);
    add_constraint({v, 0}, {D, 0});
    add_constraint({v, 1}, {D, 1});
  }
+  // Element-wise
  else if(dynamic_cast<ir::user*>(v)){
    for(unsigned i = 0; i < shapes.size(); i ++)
      for(ir::value* op: v->ops()){
@@ -102,18 +111,19 @@ std::map<std::string, unsigned*> tune::get_params(ir::instruction* i) {
  return params_.at(i);
 }

+
 void tune::run(ir::module &mod) {
  for(ir::function *fn: mod.get_function_list()){
    // Build constraints graph
    for(ir::basic_block *block: fn->blocks())
    for(ir::instruction *i : block->get_inst_list())
-    if(i->get_type()->is_tile_ty()){
+    if(i->has_tile_result_or_op()){
      init_c_graph(i);
    }
    // Build phi constraints
    for(ir::basic_block *block: fn->blocks())
    for(ir::instruction *i : block->get_inst_list())
-    if(i->get_type()->is_tile_ty())
+    if(i->has_tile_result_or_op())
      init_c_phi(i);
    // Layout parameters
    while(!nodes_.empty()){
--- a/lib/ir/instructions.cpp
+++ b/lib/ir/instructions.cpp
@@ -25,6 +25,12 @@ void instruction::erase_from_parent() {
  parent_->erase(this);
 }

+bool instruction::has_tile_result_or_op() { // does the result or any operand have a tile type?
+  bool found = get_type()->is_tile_ty();
+  for(ir::value *operand: ops())
+    found = operand->get_type()->is_tile_ty() || found; // operand is always queried, as with |=
+  return found;
+}

 //===----------------------------------------------------------------------===//
 //                               phi_node classes