From f8e522ada89bef53ef8707568a306845aea3f354 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Mon, 11 Feb 2019 17:27:16 -0500
Subject: [PATCH] codegen: hoist shared-memory loads out of the GEMM inner
 loop; groundwork for double-buffered shared tiles (phi-nodes over shared
 copies in selection/shared_copy)

---
 examples/matrix.cpp           | 16 +++++---
 include/codegen/selection.h   |  2 +
 include/codegen/shared_copy.h |  5 +++
 lib/codegen/selection.cpp     | 73 ++++++++++++++++++++++++++++-------
 lib/codegen/shared_copy.cpp   | 28 +++++++++-----
 lib/ir/module.cpp             |  3 ++
 6 files changed, 98 insertions(+), 29 deletions(-)

diff --git a/examples/matrix.cpp b/examples/matrix.cpp
index 7af5c0fb9..8360fcf3c 100644
--- a/examples/matrix.cpp
+++ b/examples/matrix.cpp
@@ -1,6 +1,7 @@
 #include
 #include
 #include "cuda.h"
+#include "llvm/IR/Verifier.h"
 #include "ast/ast.h"
 #include "ir/context.h"
 #include "ir/module.h"
@@ -22,6 +23,7 @@
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Transforms/Scalar/EarlyCSE.h"
+#include "llvm/Analysis/LoopPass.h"
 typedef struct yy_buffer_state * YY_BUFFER_STATE;
 extern int yyparse();
@@ -44,12 +46,14 @@ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\
 fp32* pa[16, 8] = a + rxa[:, newaxis] + rka[newaxis, :]*M;\
 fp32* pb[16, 8] = b + ryb[:, newaxis] + rkb[newaxis, :]*K;\
 fp32* pc[16, 16] = c + rxc[:, newaxis] + ryc[newaxis, :]*M;\
+ fp32 a[16, 8] = *pa;\
+ fp32 b[16, 8] = *pb;\
 for(k = K; k > 0; k = k - 8){\
- fp32 a[16, 8] = *pa;\
- fp32 b[16, 8] = *pb;\
 C = dot(a, b, C);\
 pa = pa + 8*M;\
 pb = pb + 8*K;\
+ a = *pa;\
+ b = *pb;\
 }\
 *pc = C;\
 }\
@@ -200,11 +204,11 @@ int main() {
 selection.run(module, llvm_module);
 // llvm source
- llvm::PrintModulePass print(llvm::outs());
- llvm::AnalysisManager analysis;
- print.run(llvm_module, analysis);
+ llvm::legacy::PassManager manager;
+ manager.add(llvm::createPrintModulePass(llvm::outs()));
+// manager.add(llvm::createVerifierPass(true));
+ manager.run(llvm_module);
- // generate machine code
 std::string src = generate_machine_code(llvm_module, "nvptx64-nvidia-cuda", compute_data_layout(true, true)); // 
std::cout << src << std::endl; diff --git a/include/codegen/selection.h b/include/codegen/selection.h index 18b77f42c..4dedbd088 100644 --- a/include/codegen/selection.h +++ b/include/codegen/selection.h @@ -54,6 +54,7 @@ public: shared_tile(llvm::Type* ty, const shapes_t &shapes, llvm::Value* ptr, llvm::IRBuilder<> &builder); void set_value(indices_t, llvm::Value *); llvm::Value* get_value(indices_t idx); + llvm::Value* get_pointer() { return ptr_; } private: llvm::Value *ptr_; @@ -102,6 +103,7 @@ private: llvm::Constant* llvm_constant(ir::constant *cst, llvm::LLVMContext &ctx); // grid construction + bool is_shared(ir::value *v); void create_grids(std::vector &grids, std::map &references, ir::function *fn); diff --git a/include/codegen/shared_copy.h b/include/codegen/shared_copy.h index 46cd8cbc8..8512ec358 100644 --- a/include/codegen/shared_copy.h +++ b/include/codegen/shared_copy.h @@ -5,11 +5,16 @@ namespace tdl { namespace ir { class module; + class value; + class builder; } namespace codegen{ class place_shared_copy { +private: + void add(ir::value *x, ir::builder &builder); + public: void run(ir::module &mod); }; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index d6a4a0bc5..5782212ad 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -8,6 +8,7 @@ #include "ir/function.h" #include "ir/type.h" #include "llvm/Transforms/Scalar/EarlyCSE.h" +#include "llvm/Analysis/LoopInfo.h" namespace tdl{ namespace codegen{ @@ -61,6 +62,7 @@ unsigned distributed_tile::get_linear_index(indices_t idx) { } void distributed_tile::for_each(std::function fn) { + std::cout << "vector size: " << vector_size_ << std::endl; for(auto &idx: indices_) if(idx.second % vector_size_ == 0) fn(idx.first); @@ -345,8 +347,7 @@ void selection::create_grids(std::vector &grids, bind_references(op); // bind const auto& shapes = v->get_type()->get_tile_shapes(); - bool is_shared = dynamic_cast(v); - if(is_shared) + if(is_shared(v)) return; for(size_t d = 
0; d < shapes.size(); d++){ if(shapes[d] == 1) @@ -368,6 +369,18 @@ void selection::create_grids(std::vector &grids, grids.push_back(ref.second); } +bool selection::is_shared(ir::value *v) { + if(auto *phi = dynamic_cast(v)){ + bool result = true; + for(ir::value *op: phi->ops()) + result = result && is_shared(op); + return result; + } + else + return (bool)dynamic_cast(v); + +} + void selection::create_tile(ir::value *v, IRBuilder<> &builder, const std::map& references, std::set &seen, Value *sh_mem_ptr) { @@ -380,12 +393,33 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, const auto& shapes = v->get_type()->get_tile_shapes(); Type* ty = llvm_type(v->get_type()->get_scalar_ty(), ctx); // create shared tile - bool is_shared = dynamic_cast(v); - if(is_shared){ - size_t offset = alloc_->get_offset(v); - Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); - ptr = builder.CreateBitCast(ptr, ty->getPointerTo(ptr->getType()->getPointerAddressSpace())); - tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); + if(is_shared(v)){ + // shared copy + PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr->getType()->getPointerAddressSpace()); + if(dynamic_cast(v)) { + size_t offset = alloc_->get_offset(v); + Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); + ptr = builder.CreateBitCast(ptr, ptr_ty); + tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); + } + // phi-node (double-buffering) + else if(auto *phi = dynamic_cast(v)) { + BasicBlock *parent = (BasicBlock*)vmap_[phi->get_parent()]; + builder.SetInsertPoint(&*parent->getFirstInsertionPt()); + PHINode *ptr = builder.CreatePHI(ptr_ty, 2); + for(ir::value *op: phi->ops()){ + ir::instruction *inc_val = dynamic_cast(op); + BasicBlock *inc_block = (BasicBlock*)vmap_[inc_val->get_parent()]; + size_t offset = alloc_->get_offset(inc_val); + builder.SetInsertPoint(inc_block); + Value *inc_ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(0)); + inc_ptr = 
builder.CreateBitCast(inc_ptr, ptr_ty); + ptr->addIncoming(inc_ptr, inc_block); + } + tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); + } + else + throw std::runtime_error("unknown shared memory tile"); } // create distributed tile else { @@ -532,6 +566,8 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & ti->set_value(idx, in->get_value(idx)); }); } + else if(is_shared(ins)) + return; // matrix multiplication else if(dynamic_cast(ins)) { ir::value *A = ins->get_operand(0); @@ -607,14 +643,10 @@ void selection::run(ir::module &src, Module &dst){ dst_builder.SetInsertPoint((BasicBlock*)vmap_[fn->blocks()[0]]); // allocate shared memory Value *sh_mem_ptr = nullptr; - if(unsigned alloc_size = alloc_->get_allocated_size()){ + if(alloc_->get_allocated_size()){ Type *int_8_ty = Type::getInt8Ty(dst_ctx); - ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); Type *ptr_ty = PointerType::get(int_8_ty, 3); - GlobalVariable *sh_mem_array = - new GlobalVariable(*dst_fn->getParent(), array_ty, false, GlobalVariable::InternalLinkage, - nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); - sh_mem_ptr = dst_builder.CreateBitCast(sh_mem_array, ptr_ty); + sh_mem_ptr = Constant::getNullValue(ptr_ty); } // create grids init_grids(fn, dst_builder, sh_mem_ptr); @@ -628,6 +660,19 @@ void selection::run(ir::module &src, Module &dst){ for(ir::basic_block *block: fn->blocks()) for(ir::instruction *inst: block->get_inst_list()) if(auto *phi = dynamic_cast(inst)){ + if(is_shared(phi)){ +// PHINode *ptr = (PHINode*)(((shared_tile*)tmap_.at(phi))->get_pointer()); +// for(ir::value *op: phi->ops()){ +// ir::instruction *inc_val = dynamic_cast(op); +// BasicBlock *inc_block = (BasicBlock*)vmap_[inc_val->get_parent()]; +// size_t offset = alloc_->get_offset(inc_val); +// dst_builder.SetInsertPoint(inc_block); +// Value *inc_ptr = dst_builder.CreateGEP(sh_mem_ptr, dst_builder.getInt32(offset)); +// inc_ptr = 
dst_builder.CreateBitCast(inc_ptr, ptr->getType()); +// ptr->addIncoming(inc_ptr, inc_block); +// } + continue; + } for(unsigned n = 0; n < phi->get_num_incoming(); n++){ ir::value *inc_val = phi->get_incoming_value(n); ir::basic_block *inc_block = phi->get_incoming_block(n); diff --git a/lib/codegen/shared_copy.cpp b/lib/codegen/shared_copy.cpp index a6c64e08d..08bac4f9a 100644 --- a/lib/codegen/shared_copy.cpp +++ b/lib/codegen/shared_copy.cpp @@ -8,21 +8,31 @@ namespace tdl { namespace codegen{ +void place_shared_copy::add(ir::value *x, ir::builder &builder) { + if(auto *phi = dynamic_cast(x)) { + for(auto *op: phi->ops()) + add(op, builder); + } + else { + if(auto *i = dynamic_cast(x)){ + ir::basic_block* block = i->get_parent(); + auto it = std::find(block->begin(), block->end(), i); + builder.set_insert_point(++it); + } + ir::instruction *rx = (ir::instruction*)builder.create_copy_to_shared(x); + x->replace_all_uses_with(rx); + rx->set_operand(0, x); + } +} + void place_shared_copy::run(ir::module &mod) { ir::builder &builder = mod.get_builder(); for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()) if(dynamic_cast(i)){ - builder.set_insert_point(i); - ir::value *x = i->get_operand(0); - ir::value *y = i->get_operand(1); - ir::instruction *rx = (ir::instruction*)builder.create_copy_to_shared(x); - ir::instruction *ry = (ir::instruction*)builder.create_copy_to_shared(y); - x->replace_all_uses_with(rx); - y->replace_all_uses_with(ry); - rx->set_operand(0, x); - ry->set_operand(0, y); + add(i->get_operand(0), builder); + add(i->get_operand(1), builder); } } diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp index d95e21c3b..29636657f 100644 --- a/lib/ir/module.cpp +++ b/lib/ir/module.cpp @@ -61,6 +61,9 @@ ir::value *module::try_remove_trivial_phis(ir::phi_node *&phi){ } ir::value *module::add_phi_operands(const std::string& name, ir::phi_node *&phi){ + // already initialized + 
if(phi->get_num_operands()) + return phi; ir::basic_block *block = phi->get_parent(); for(ir::basic_block *pred: block->get_predecessors()){ ir::value *value = get_value(name, pred);