diff --git a/examples/matrix.cpp b/examples/matrix.cpp
index bc37b81ae..bc7239038 100644
--- a/examples/matrix.cpp
+++ b/examples/matrix.cpp
@@ -32,10 +32,7 @@ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\
   int32 k;\
   fp32* pa[32, 8] = a + rx[:, newaxis] + rka[newaxis, :]*M;\
   fp32* pb[32, 8] = b + ry[:, newaxis] + rkb[newaxis, :]*K;\
-  fp32* pc[32, 32];\
-  for(k = 0; k < K; k = k + 8){\
-  }\
-  pc = c + rx[:, newaxis] + ry[newaxis, :];\
+  fp32* pc[32, 32] = c + rx[:, newaxis] + ry[newaxis, :]*M;\
   *pc = C;\
 }\
 ";
@@ -59,15 +56,13 @@ int main() {
    tune.run(module);
    std::vector<unsigned> params = {
      // asm
-     2, 16, 1,
+     2, 8, 1,
      // bsn
-     2, 16, 1,
+     4, 4, 1,
      // pa
-     1, 2, 4,
+     2, 4, 1,
      // pb
-     1, 2, 4,
-     // c
-     2, 16, 1, 1, 2, 4
+     1, 8, 1,
    };
    std::map<tdl::ir::value*, std::vector<std::string>> errors;
    unsigned i = 0;
@@ -75,11 +70,11 @@ int main() {
    for(unsigned *x: tune.get_params(module))
      *x = params[i++];
    tune.check_constraints(module, errors);
-//   std::cout << "errors: " << errors.size() << std::endl;
-//   for(auto &x: errors){
-//   for(auto &e: x.second)
-//     std::cout << e << std::endl;
-//   }
+   std::cout << "errors: " << errors.size() << std::endl;
+   for(auto &x: errors){
+   for(auto &e: x.second)
+     std::cout << e << std::endl;
+   }
    shared.run(module);
    liveness.run(module);
    allocation.run();
diff --git a/include/ast/ast.h b/include/ast/ast.h
index fed9d6556..7a2a62563 100644
--- a/include/ast/ast.h
+++ b/include/ast/ast.h
@@ -114,9 +114,15 @@ public:
   const slice_enum_t type_;
 };
 
+class named_expression;
+
 class expression: public node{
 public:
   virtual ir::value* codegen(ir::module *) const = 0;
+  named_expression *lvalue() const { return lvalue_; }  // null when the expression is not assignable
+
+protected:
+  named_expression *lvalue_ = nullptr;  // set by named_expression and by DEREF unary_operator
 };
 
 class postfix_expression: public expression{
@@ -163,10 +169,9 @@ private:
   const list<slice*>* slices_;
 };
 
-
 class named_expression: public expression {
 public:
-  named_expression(node *id): id_((const identifier*)id) { }
+  named_expression(node *id): id_((const identifier*)id) { lvalue_ = this; }  // a named expression is its own l-value
   const identifier *id() const { return id_; }
   ir::value* codegen(ir::module * mod) const;
 
@@ -227,8 +232,11 @@ private:
 
 public:
   unary_operator(UNARY_OP_T op, node *arg)
-    : op_(op),
-      arg_((expression*)arg) { }
+      : op_(op), arg_((expression*)arg) {
+    // A dereference is assignable through its operand, so it shares the
+    // operand's l-value; every other operator leaves lvalue_ untouched.
+    if(DEREF == op) lvalue_ = arg_->lvalue();
+  }
 
   UNARY_OP_T get_op() const { return op_; }
   ir::value* codegen(ir::module *mod) const;
diff --git a/include/codegen/selection.h b/include/codegen/selection.h
index 92c9f79b5..6fa05782e 100644
--- a/include/codegen/selection.h
+++ b/include/codegen/selection.h
@@ -32,9 +32,10 @@ protected:
   typedef std::vector<unsigned> shapes_t;
 
 public:
-  tile(const shapes_t &shapes): shapes_(shapes){ }
+  tile(llvm::Type *ty, const shapes_t &shapes): ty_(ty), shapes_(shapes){ }  // ty is kept in ty_, shapes in shapes_
 
 private:
+  llvm::Type *ty_;
   shapes_t shapes_;
 };
 
@@ -46,13 +47,20 @@ public:
 
 class distributed_tile: public tile{
   typedef std::vector<distributed_axis> axes_t;
+  typedef std::vector<llvm::Value*> indices_t;          // one index value per axis
+  typedef std::map<indices_t, unsigned> indices_map_t;  // multi-index -> enumeration order
+  typedef std::vector<llvm::Value*> values_t;           // one value per enumerated multi-index
+
+private:
+  void init_indices();  // fills indices_ from axes_
 
 public:
-  distributed_tile(const shapes_t& shapes, const axes_t &axes)
-    : tile(shapes), axes_(axes) {}
+  distributed_tile(llvm::Type *ty, const shapes_t& shapes, const axes_t &axes);  // defined in selection.cpp
 
 private:
   axes_t axes_;
+  indices_map_t indices_;  // populated by init_indices()
+  values_t values_;        // constructor adds one entry per element of indices_
 };
 
 
diff --git a/include/ir/instructions.h b/include/ir/instructions.h
index 09c129160..44bafb151 100644
--- a/include/ir/instructions.h
+++ b/include/ir/instructions.h
@@ -26,6 +26,8 @@ public:
   const basic_block *get_parent() const { return parent_;  }
   basic_block *get_parent()             { return parent_;  }
   void erase_from_parent();
+  // helpers
+  bool has_tile_result_or_op();  // true if the result type or any operand type is a tile type
 
 private:
   basic_block *parent_;
diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp
index 490dcdba1..9d91e3588 100644
--- a/lib/ast/lowering.cpp
+++ b/lib/ast/lowering.cpp
@@ -523,7 +523,8 @@ ir::value *assignment_expression::codegen(ir::module *mod) const{
     mod->set_value(x->id()->name(), rvalue);
   else if(auto* x = dynamic_cast<const unary_operator*>(lvalue_)){
     assert(x->get_op()==DEREF);
-    ir::value *ptr = x->codegen(mod);
+    assert(x->lvalue());
+    ir::value *ptr = x->lvalue()->codegen(mod);
     mod->get_builder().create_store(ptr, rvalue);
   }
   return rvalue;
diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp
index 4092be811..497f1f302 100644
--- a/lib/codegen/selection.cpp
+++ b/lib/codegen/selection.cpp
@@ -13,6 +13,33 @@ namespace codegen{
 
 using namespace llvm;
 
+/* Distributed Tile */
+// Number every combination of per-axis index values, axis 0 varying fastest.
+void distributed_tile::init_indices() {
+  for(const distributed_axis &axis: axes_)
+    if(axis.values.empty()) return;            // empty axis: nothing to enumerate
+  std::vector<size_t> id(axes_.size(), 0);
+  while(!id.empty()) {                         // no axes: nothing to enumerate
+    indices_t current;
+    for(size_t d = 0; d < id.size(); d++)
+      current.push_back(axes_[d].values[id[d]]);
+    const unsigned position = indices_.size(); // read before operator[] inserts
+    indices_[current] = position;
+    size_t k = 0;                              // odometer increment with carry
+    while(++id[k] == axes_[k].values.size()){
+      if(k == id.size() - 1) return;           // last axis wrapped: done
+      id[k++] = 0;
+    }
+  }
+}
+
+distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const axes_t &axes)
+    : tile(ty, shapes), axes_(axes) {
+  init_indices();
+  // one undef placeholder per enumerated index (tile::ty_ is private, so use ty)
+  values_.assign(indices_.size(), UndefValue::get(ty));
+}
+
 
 /* convert ir::type to Type */
 Type *selection::llvm_type(ir::type *ty, LLVMContext &ctx) {
@@ -186,7 +213,7 @@ void selection::init_axes(ir::instruction *instr, IRBuilder<> &builder, Value *u
       unsigned offset = n / contiguous[k] * per_block + n % contiguous[k];
       idx_list[n] = builder.CreateAdd(thread_id, builder.getInt32(offset));
     }
-    axes[k] = {idx_list};
+    axes[k] = distributed_axis{idx_list};
   }
   // Store axes
   axes_[instr] = axes;
@@ -230,6 +257,7 @@ void selection::create_grids(std::vector<ir::instruction*> &grids,
 void selection::init_grids(ir::function *fn, IRBuilder<> &builder){
   // fetch linear ID
   Module *mod = builder.GetInsertBlock()->getParent()->getParent();
+  LLVMContext &ctx = builder.getContext();
   Function *get_thread_id = Intrinsic::getDeclaration(mod, Intrinsic::nvvm_read_ptx_sreg_tid_x);
   Value *warp_size = builder.getInt32(32);
   Value *u_thread_id = builder.CreateCall(get_thread_id, {});
@@ -248,9 +276,10 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder){
       continue;
     bool is_shared = dynamic_cast<ir::copy_to_shared_inst*>(i);
     const auto& shapes = i->get_type()->get_tile_shapes();
+    Type* ty = llvm_type(i->get_type(), ctx);
     // create shared tile
     if(is_shared){
-      tmap_.insert({i, new shared_tile(shapes)});
+      tmap_.insert({i, new shared_tile(ty, shapes)});
     }
     // create distributed tile
     else {
@@ -264,20 +293,18 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder){
         else
           axes[d].values = {builder.getInt32(0)};
       }
-      tmap_.insert({i, new distributed_tile(shapes, axes)});
+      tmap_.insert({i, new distributed_tile(ty, shapes, axes)});
     }
   }
 }
 
 void selection::lower_tile_instruction(ir::instruction *src, llvm::IRBuilder<> &builder) {
-
+  std::cout << typeid(*src).name() << std::endl;  // only prints src's dynamic type name; builder is unused here
 }
 
 void selection::lower_instruction(ir::instruction *src, IRBuilder<> &builder) {
   LLVMContext &ctx = builder.getContext();
-  std::cout << typeid(*src).name() << " " << src->get_type()->get_type_id() << std::endl;
-  if(src->get_type()->is_tile_ty()) {
-    std::cout << "tile instruction" << std::endl;
+  if(src->has_tile_result_or_op()) {
     lower_tile_instruction(src, builder);
   }
   else {
diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp
index 7d0a673a2..63e4f582f 100644
--- a/lib/codegen/tune.cpp
+++ b/lib/codegen/tune.cpp
@@ -29,7 +29,13 @@ void tune::init_c_phi(ir::instruction *v) {
 }
 
 void tune::init_c_graph(ir::instruction *v) {
-  const auto& shapes = v->get_type()->get_tile_shapes();
+  // Reference shape
+  std::vector<unsigned> shapes;
+  if(auto *store = dynamic_cast<ir::store_inst*>(v))
+    shapes = store->get_pointer_operand()->get_type()->get_tile_shapes();
+  else
+    shapes = v->get_type()->get_tile_shapes();
+  // Reshape
   if(dynamic_cast<ir::reshape_inst*>(v)){
     ir::value *op = v->get_operand(0);
     unsigned current = 0;
@@ -40,9 +46,11 @@ void tune::init_c_graph(ir::instruction *v) {
         add_constraint({v, i}, {op, current++});
     }
   }
+  // Splat
   else if(dynamic_cast<ir::splat_inst*>(v)){
 
   }
+  // Broadcast
   else if(dynamic_cast<ir::broadcast_inst*>(v)){
     ir::value *op = v->get_operand(0);
     ir::type *op_ty = op->get_type();
@@ -51,13 +59,14 @@ void tune::init_c_graph(ir::instruction *v) {
       if(op_shapes[i] == shapes[i] && v != op)
         add_constraint({v, i}, {op, i});
     }
-
   }
+  // Matrix multiplication
   else if(dynamic_cast<ir::matmul_inst*>(v)){
     ir::value *D = v->get_operand(2);
     add_constraint({v, 0}, {D, 0});
     add_constraint({v, 1}, {D, 1});
   }
+  // Element-wise
   else if(dynamic_cast<ir::user*>(v)){
     for(unsigned i = 0; i < shapes.size(); i ++)
       for(ir::value* op: v->ops()){
@@ -102,18 +111,19 @@ std::map<std::string, unsigned*> tune::get_params(ir::instruction* i) {
   return params_.at(i);
 }
 
+
 void tune::run(ir::module &mod) {
   for(ir::function *fn: mod.get_function_list()){
     // Build constraints graph
     for(ir::basic_block *block: fn->blocks())
     for(ir::instruction *i : block->get_inst_list())
-    if(i->get_type()->is_tile_ty()){
+    if(i->has_tile_result_or_op()){
       init_c_graph(i);
     }
     // Build phi constraints
     for(ir::basic_block *block: fn->blocks())
     for(ir::instruction *i : block->get_inst_list())
-    if(i->get_type()->is_tile_ty())
+    if(i->has_tile_result_or_op())
       init_c_phi(i);
     // Layout parameters
     while(!nodes_.empty()){
diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp
index 679ee5bb2..3cbabd45e 100644
--- a/lib/ir/instructions.cpp
+++ b/lib/ir/instructions.cpp
@@ -25,6 +25,12 @@ void instruction::erase_from_parent() {
   parent_->erase(this);
 }
 
+bool instruction::has_tile_result_or_op() {
+  bool any_tile = get_type()->is_tile_ty();             // tile-typed result?
+  for(ir::value *operand: ops())                        // ...or any tile-typed operand?
+    any_tile = operand->get_type()->is_tile_ty() || any_tile;
+  return any_tile;
+}
 
 //===----------------------------------------------------------------------===//
 //                               phi_node classes