From e45d6bbb60ea433635522d6e6961bafec78e8b18 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 12 Feb 2019 11:00:24 -0500 Subject: [PATCH] some cleaning --- examples/matrix.cpp | 14 +++--- include/codegen/allocation.h | 7 +-- include/codegen/liveness.h | 7 ++- include/codegen/selection.h | 7 ++- lib/codegen/allocation.cpp | 6 ++- lib/codegen/liveness.cpp | 3 +- lib/codegen/selection.cpp | 82 ++++++++++++++++++------------------ 7 files changed, 72 insertions(+), 54 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 8360fcf3c..9fdbcf96a 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -11,6 +11,7 @@ #include "codegen/allocation.h" #include "codegen/liveness.h" #include "codegen/vectorize.h" +#include "codegen/buffer_info.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/Module.h" #include "llvm/IR/LLVMContext.h" @@ -162,11 +163,12 @@ int main() { // create passes tdl::codegen::place_shared_copy shared; + tdl::codegen::buffer_info_pass buffer_info; tdl::codegen::tune tune; - tdl::codegen::liveness liveness; - tdl::codegen::allocation allocation(&liveness); + tdl::codegen::liveness liveness(&buffer_info); + tdl::codegen::allocation allocation(&liveness, &buffer_info); tdl::codegen::vectorize vectorize(&tune); - tdl::codegen::selection selection(&allocation, &tune); + tdl::codegen::selection selection(&allocation, &tune, &buffer_info); // tuning parameters tune.run(module); @@ -186,7 +188,6 @@ int main() { }; std::map> errors; unsigned i = 0; - std::cout << tune.get_params(module).size() << std::endl; for(unsigned *x: tune.get_params(module)) *x = params[i++]; tune.check_constraints(module, errors); @@ -198,6 +199,7 @@ int main() { // run passes shared.run(module); + buffer_info.run(module); liveness.run(module); allocation.run(); vectorize.run(module); @@ -206,7 +208,7 @@ int main() { // llvm source llvm::legacy::PassManager manager; manager.add(llvm::createPrintModulePass(llvm::outs())); -// manager.add(llvm::createVerifierPass(true)); + manager.add(llvm::createVerifierPass(true)); manager.run(llvm_module); std::string src = generate_machine_code(llvm_module, "nvptx64-nvidia-cuda", compute_data_layout(true, true)); @@ -220,7 +222,7 @@ int main() { CUstream cu_stream; int major, minor; compile_machine_code(cu_device, cu_context, cu_module, cu_kernel, cu_stream, major, minor, src, "test"); - std::cout << src << std::endl; +// std::cout << src << std::endl; // execute machine code // Allocate buffers diff --git a/include/codegen/allocation.h b/include/codegen/allocation.h index 4b90cf46a..96366b526 100644 --- a/include/codegen/allocation.h +++ b/include/codegen/allocation.h @@ -16,12 +16,12 @@ namespace codegen{ class layout; class target_tuner; class liveness; -class loop_info; +class buffer_info_pass; class allocation { public: - allocation(liveness *live) - : liveness_(live){ } + allocation(liveness *live, buffer_info_pass *buffer_info) + : liveness_(live), buffer_info_(buffer_info){ } // accessors unsigned get_offset(ir::value *x) const { return offsets_.at(x); } @@ -36,6 +36,7 @@ private: size_t allocated_size_; // dependences liveness *liveness_; + buffer_info_pass *buffer_info_; }; } diff --git a/include/codegen/liveness.h b/include/codegen/liveness.h index 11d377c62..fd4faf2f3 100644 --- a/include/codegen/liveness.h +++ b/include/codegen/liveness.h @@ -15,6 +15,8 @@ namespace codegen{ typedef unsigned slot_index; +class buffer_info_pass; + struct segment { slot_index start; slot_index end; @@ -35,11 +37,13 @@ private: typedef std::map has_storage_map_t; public: - /// Intervals iterators... + // Intervals iterators using iterator = intervals_map_t::iterator; using const_iterator = intervals_map_t::const_iterator; public: + // constructor + liveness(buffer_info_pass *info): info_(info){ } // accessors const intervals_map_t& intervals() const { return intervals_; } @@ -49,6 +53,7 @@ public: void run(ir::module &mod); private: + buffer_info_pass *info_; has_storage_map_t has_dedicated_storage_; indices_map_t indices_; intervals_map_t intervals_; diff --git a/include/codegen/selection.h b/include/codegen/selection.h index 4dedbd088..6580ade98 100644 --- a/include/codegen/selection.h +++ b/include/codegen/selection.h @@ -7,6 +7,7 @@ #include "ir/module.h" #include "ir/function.h" #include "ir/type.h" +#include "codegen/buffer_info.h" namespace llvm{ @@ -22,6 +23,8 @@ namespace codegen{ class allocation; class tune; +class buffer_info_pass; + typedef std::vector indices_t; struct distributed_axis { @@ -103,7 +106,6 @@ private: llvm::Constant* llvm_constant(ir::constant *cst, llvm::LLVMContext &ctx); // grid construction - bool is_shared(ir::value *v); void create_grids(std::vector &grids, std::map &references, ir::function *fn); @@ -116,7 +118,7 @@ private: void lower_tile_instruction(ir::instruction *src, llvm::IRBuilder<> &builder); public: - selection(allocation *alloc, tune *params): alloc_(alloc), params_(params){ } + selection(allocation *alloc, tune *params, buffer_info_pass *buffer_info): alloc_(alloc), params_(params), buffer_info_(buffer_info){ } void run(ir::module &src, llvm::Module &dst); private: @@ -124,6 +126,7 @@ private: tmap_t tmap_; allocation *alloc_; tune *params_; + buffer_info_pass *buffer_info_; std::map axes_; }; diff --git a/lib/codegen/allocation.cpp b/lib/codegen/allocation.cpp index 7a5154280..74c9f4c58 100644 --- a/lib/codegen/allocation.cpp +++ b/lib/codegen/allocation.cpp @@ -1,6 +1,7 @@ #include "codegen/allocation.h" #include "codegen/liveness.h" #include "codegen/layout.h" +#include "codegen/buffer_info.h" #include "ir/basic_block.h" #include "ir/type.h" #include "ir/value.h" @@ -16,7 +17,10 @@ void allocation::run(){ typedef std::multimap triples_map_type; auto get_num_bytes = [&](ir::value *x){ - return x->get_type()->get_tile_bitwidth(); + unsigned result = x->get_type()->get_tile_bitwidth(); + if(buffer_info_->is_double(x)) + result *= 2; + return result; }; std::vector I; diff --git a/lib/codegen/liveness.cpp b/lib/codegen/liveness.cpp index bf4c99be2..05b803f8f 100644 --- a/lib/codegen/liveness.cpp +++ b/lib/codegen/liveness.cpp @@ -1,4 +1,5 @@ #include "codegen/liveness.h" +#include "codegen/buffer_info.h" #include "ir/basic_block.h" #include "ir/function.h" #include "ir/module.h" @@ -23,7 +24,7 @@ for(ir::function *fn: mod.get_function_list()){ // Creates live intervals for(auto i: indices_){ ir::value *v = i.first; - if(!dynamic_cast(v)) + if(!info_->is_shared(v) || info_->get_reference(v)) continue; unsigned start = i.second; unsigned end = start; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 5782212ad..a743a5162 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -62,7 +62,6 @@ unsigned distributed_tile::get_linear_index(indices_t idx) { } void distributed_tile::for_each(std::function fn) { - std::cout << "vector size: " << vector_size_ << std::endl; for(auto &idx: indices_) if(idx.second % vector_size_ == 0) fn(idx.first); @@ -347,7 +346,7 @@ void selection::create_grids(std::vector &grids, bind_references(op); // bind const auto& shapes = v->get_type()->get_tile_shapes(); - if(is_shared(v)) + if(buffer_info_->is_shared(v)) return; for(size_t d = 0; d < shapes.size(); d++){ if(shapes[d] == 1) @@ -369,18 +368,6 @@ void selection::create_grids(std::vector &grids, grids.push_back(ref.second); } -bool selection::is_shared(ir::value *v) { - if(auto *phi = dynamic_cast(v)){ - bool result = true; - for(ir::value *op: phi->ops()) - result = result && is_shared(op); - return result; - } - else - return (bool)dynamic_cast(v); - -} - void selection::create_tile(ir::value *v, IRBuilder<> &builder, const std::map& references, std::set &seen, Value *sh_mem_ptr) { @@ -393,7 +380,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, const auto& shapes = v->get_type()->get_tile_shapes(); Type* ty = llvm_type(v->get_type()->get_scalar_ty(), ctx); // create shared tile - if(is_shared(v)){ + if(buffer_info_->is_shared(v)){ // shared copy PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr->getType()->getPointerAddressSpace()); if(dynamic_cast(v)) { @@ -405,17 +392,8 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, // phi-node (double-buffering) else if(auto *phi = dynamic_cast(v)) { BasicBlock *parent = (BasicBlock*)vmap_[phi->get_parent()]; - builder.SetInsertPoint(&*parent->getFirstInsertionPt()); + builder.SetInsertPoint(parent); PHINode *ptr = builder.CreatePHI(ptr_ty, 2); - for(ir::value *op: phi->ops()){ - ir::instruction *inc_val = dynamic_cast(op); - BasicBlock *inc_block = (BasicBlock*)vmap_[inc_val->get_parent()]; - size_t offset = alloc_->get_offset(inc_val); - builder.SetInsertPoint(inc_block); - Value *inc_ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(0)); - inc_ptr = builder.CreateBitCast(inc_ptr, ptr_ty); - ptr->addIncoming(inc_ptr, inc_block); - } tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); } else @@ -550,7 +528,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & if(linear % vector_size == 0) packets[id] = result->get_value(idx); packets[id] = builder.CreateInsertElement(packets[id], in->get_value(idx), linear % vector_size); - std::cout << linear << std::endl; }); result->for_each([&](indices_t idx){ unsigned linear = in->get_linear_index(idx); @@ -566,7 +543,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & ti->set_value(idx, in->get_value(idx)); }); } - else if(is_shared(ins)) + else if(buffer_info_->is_shared(ins)) return; // matrix multiplication else if(dynamic_cast(ins)) { @@ -643,10 +620,14 @@ void selection::run(ir::module &src, Module &dst){ dst_builder.SetInsertPoint((BasicBlock*)vmap_[fn->blocks()[0]]); // allocate shared memory Value *sh_mem_ptr = nullptr; - if(alloc_->get_allocated_size()){ + if(unsigned alloc_size = alloc_->get_allocated_size()){ Type *int_8_ty = Type::getInt8Ty(dst_ctx); + ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); Type *ptr_ty = PointerType::get(int_8_ty, 3); - sh_mem_ptr = Constant::getNullValue(ptr_ty); + GlobalVariable *sh_mem_array = + new GlobalVariable(*dst_fn->getParent(), array_ty, false, GlobalVariable::ExternalLinkage, + nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); + sh_mem_ptr = dst_builder.CreateBitCast(sh_mem_array, ptr_ty); } // create grids init_grids(fn, dst_builder, sh_mem_ptr); @@ -660,17 +641,38 @@ void selection::run(ir::module &src, Module &dst){ for(ir::basic_block *block: fn->blocks()) for(ir::instruction *inst: block->get_inst_list()) if(auto *phi = dynamic_cast(inst)){ - if(is_shared(phi)){ -// PHINode *ptr = (PHINode*)(((shared_tile*)tmap_.at(phi))->get_pointer()); -// for(ir::value *op: phi->ops()){ -// ir::instruction *inc_val = dynamic_cast(op); -// BasicBlock *inc_block = (BasicBlock*)vmap_[inc_val->get_parent()]; -// size_t offset = alloc_->get_offset(inc_val); -// dst_builder.SetInsertPoint(inc_block); -// Value *inc_ptr = dst_builder.CreateGEP(sh_mem_ptr, dst_builder.getInt32(offset)); -// inc_ptr = dst_builder.CreateBitCast(inc_ptr, ptr->getType()); -// ptr->addIncoming(inc_ptr, inc_block); -// } + if(buffer_info_->is_shared(phi)) { + BasicBlock *parent = (BasicBlock*)vmap_[phi->get_parent()]; + unsigned id_pre = 0, id_loop = 1; + if(phi->get_incoming_block(0) == phi->get_parent()) + std::swap(id_pre, id_loop); + ir::value *pre_value = phi->get_incoming_value(id_pre); + ir::value *loop_value = phi->get_incoming_value(id_loop); + BasicBlock *pre_block = (BasicBlock*)vmap_[phi->get_incoming_block(id_pre)]; + BasicBlock *loop_block = (BasicBlock*)vmap_[phi->get_incoming_block(id_loop)]; + int pre_offset = alloc_->get_offset(pre_value); + int loop_offset = alloc_->get_offset(loop_value); + dst_builder.SetInsertPoint(&*parent->getFirstInsertionPt()); + PHINode *ptr = (PHINode*)(((shared_tile*)tmap_.at(phi))->get_pointer()); + // offset + PHINode *offset = dst_builder.CreatePHI(dst_builder.getInt32Ty(), 2); + dst_builder.SetInsertPoint(parent->getFirstNonPHI()); + Value *next_offset = dst_builder.CreateNeg(offset); + offset->addIncoming(dst_builder.getInt32((loop_offset - pre_offset)/4), pre_block); + offset->addIncoming(next_offset, loop_block); + // next pointer + Value *pre_ptr = dst_builder.CreateGEP(sh_mem_ptr, dst_builder.getInt32(pre_offset)); + pre_ptr = dst_builder.CreateBitCast(pre_ptr, ptr->getType()); + Value *next_ptr = dst_builder.CreateGEP(ptr, offset); + ptr->addIncoming(pre_ptr, pre_block); + ptr->addIncoming(next_ptr, loop_block); + // barrier + Function *barrier = Intrinsic::getDeclaration(dst_fn->getParent(), Intrinsic::nvvm_barrier0); + dst_builder.SetInsertPoint(pre_block->getTerminator()); + dst_builder.CreateCall(barrier, {}); + dst_builder.SetInsertPoint(loop_block->getTerminator()); + dst_builder.CreateCall(barrier, {}); + continue; } for(unsigned n = 0; n < phi->get_num_incoming(); n++){