From 41aad4800ca74af1623c4ef750538cd00545a279 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 12 Feb 2019 11:47:52 -0500 Subject: [PATCH] [code generation] added double-buffering --- examples/matrix.cpp | 4 +- include/codegen/allocation.h | 3 ++ include/codegen/buffer_info.h | 34 ++++++++++++++++ include/codegen/vectorize.h | 27 +++++++++++++ lib/codegen/allocation.cpp | 14 +++---- lib/codegen/buffer_info.cpp | 65 ++++++++++++++++++++++++++++++ lib/codegen/selection.cpp | 76 +++++++++++++++++------------------ lib/codegen/vectorize.cpp | 28 +++++++++++++ 8 files changed, 204 insertions(+), 47 deletions(-) create mode 100644 include/codegen/buffer_info.h create mode 100644 include/codegen/vectorize.h create mode 100644 lib/codegen/buffer_info.cpp create mode 100644 lib/codegen/vectorize.cpp diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 9fdbcf96a..31e4a173e 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -212,7 +212,7 @@ int main() { manager.run(llvm_module); std::string src = generate_machine_code(llvm_module, "nvptx64-nvidia-cuda", compute_data_layout(true, true)); -// std::cout << src << std::endl; + std::cout << src << std::endl; // compile machine code CUdevice cu_device; @@ -222,7 +222,7 @@ int main() { CUstream cu_stream; int major, minor; compile_machine_code(cu_device, cu_context, cu_module, cu_kernel, cu_stream, major, minor, src, "test"); -// std::cout << src << std::endl; + std::cout << src << std::endl; // execute machine code // Allocate buffers diff --git a/include/codegen/allocation.h b/include/codegen/allocation.h index 96366b526..5bd5e85a2 100644 --- a/include/codegen/allocation.h +++ b/include/codegen/allocation.h @@ -23,6 +23,9 @@ public: allocation(liveness *live, buffer_info_pass *buffer_info) : liveness_(live), buffer_info_(buffer_info){ } + // utilities + unsigned get_num_bytes(ir::value *x); + // accessors unsigned get_offset(ir::value *x) const { return offsets_.at(x); } unsigned get_allocated_size() const { return allocated_size_; } diff --git a/include/codegen/buffer_info.h b/include/codegen/buffer_info.h new file mode 100644 index 000000000..2cce9d829 --- /dev/null +++ b/include/codegen/buffer_info.h @@ -0,0 +1,34 @@ +#ifndef TDL_INCLUDE_CODEGEN_BUFFER_INFO_PASS_H +#define TDL_INCLUDE_CODEGEN_BUFFER_INFO_PASS_H + +#include +#include + +namespace tdl { + +namespace ir { + class module; + class value; +} + +namespace codegen{ + +class buffer_info_pass { +public: + void run(ir::module &mod); + // queries + bool is_double(ir::value *x); + bool is_shared(ir::value *x); + ir::value *get_reference(ir::value *x); + +private: + std::set shared_; + std::set double_; + std::map refs_; +}; + + +} +} + +#endif diff --git a/include/codegen/vectorize.h b/include/codegen/vectorize.h new file mode 100644 index 000000000..c9c28a79c --- /dev/null +++ b/include/codegen/vectorize.h @@ -0,0 +1,27 @@ +#ifndef TDL_INCLUDE_CODEGEN_VECTORIZE_H +#define TDL_INCLUDE_CODEGEN_VECTORIZE_H + +namespace tdl { + +namespace ir { + class module; +} + +namespace codegen{ + +class tune; + +class vectorize { +public: + vectorize(tune *params): params_(params){} + void run(ir::module &mod); + +private: + tune *params_; +}; + + +} +} + +#endif diff --git a/lib/codegen/allocation.cpp b/lib/codegen/allocation.cpp index 74c9f4c58..c4957e477 100644 --- a/lib/codegen/allocation.cpp +++ b/lib/codegen/allocation.cpp @@ -11,18 +11,18 @@ namespace tdl{ namespace codegen{ +unsigned allocation::get_num_bytes(ir::value *x) { + unsigned result = x->get_type()->get_tile_bitwidth(); + if(buffer_info_->is_double(x)) + result *= 2; + return result; +} + void allocation::run(){ using std::max; using std::min; typedef std::multimap triples_map_type; - auto get_num_bytes = [&](ir::value *x){ - unsigned result = x->get_type()->get_tile_bitwidth(); - if(buffer_info_->is_double(x)) - result *= 2; - return result; - }; - std::vector I; for(auto x: liveness_->intervals()) I.push_back(x.first); diff --git a/lib/codegen/buffer_info.cpp b/lib/codegen/buffer_info.cpp new file mode 100644 index 000000000..6be951a22 --- /dev/null +++ b/lib/codegen/buffer_info.cpp @@ -0,0 +1,65 @@ +#include "codegen/buffer_info.h" +#include "ir/module.h" +#include "ir/function.h" +#include "ir/basic_block.h" +#include "ir/instructions.h" +#include "ir/type.h" + +namespace tdl { + +namespace codegen{ + + +// run pass on module +void buffer_info_pass::run(ir::module &mod) { + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i: block->get_inst_list()) { + if(!i->get_type()->is_tile_ty()) + continue; + // handle phi + if(auto *phi = dynamic_cast(i)){ + // determine if the value is in shared memory + bool is_shared = true; + bool is_double = false; + for(unsigned n = 0; n < phi->get_num_incoming(); n++){ + ir::value *inc_val = phi->get_incoming_value(n); + ir::value *inc_block = phi->get_incoming_block(n); + is_shared = is_shared && dynamic_cast(inc_val); + is_double = is_double || inc_block == phi->get_parent(); + } + // add to shared + if(is_shared) + shared_.insert(phi); + // add to double-buffered + if(is_double) + double_.insert(phi); + // set references of input + for(unsigned n = 0; n < phi->get_num_incoming(); n++){ + ir::value *inc_val = phi->get_incoming_value(n); + assert(refs_[inc_val] == nullptr); + refs_[inc_val] = phi; + } + } + // handle shared copy + if(auto *copy = dynamic_cast(i)) + shared_.insert(copy); + } +} + +// query double-buffered status +bool buffer_info_pass::is_double(ir::value *x) +{ return double_.find(x) != double_.end(); } + +// query shared status +bool buffer_info_pass::is_shared(ir::value *x) +{ return shared_.find(x) != shared_.end(); } + +// get reference if any +ir::value *buffer_info_pass::get_reference(ir::value *x) +{ return refs_[x]; } + + + +} +} diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index a743a5162..b9857eb12 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -384,17 +384,42 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, // shared copy PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr->getType()->getPointerAddressSpace()); if(dynamic_cast(v)) { - size_t offset = alloc_->get_offset(v); - Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); - ptr = builder.CreateBitCast(ptr, ptr_ty); - tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); + if(buffer_info_->get_reference(v) == nullptr){ + size_t offset = alloc_->get_offset(v); + Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); + ptr = builder.CreateBitCast(ptr, ptr_ty); + tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); + } } // phi-node (double-buffering) else if(auto *phi = dynamic_cast(v)) { BasicBlock *parent = (BasicBlock*)vmap_[phi->get_parent()]; - builder.SetInsertPoint(parent); + unsigned id_pre = 0, id_loop = 1; + if(phi->get_incoming_block(0) == phi->get_parent()) + std::swap(id_pre, id_loop); + ir::value *pre_value = phi->get_incoming_value(id_pre); + ir::value *loop_value = phi->get_incoming_value(id_loop); + BasicBlock *pre_block = (BasicBlock*)vmap_[phi->get_incoming_block(id_pre)]; + BasicBlock *loop_block = (BasicBlock*)vmap_[phi->get_incoming_block(id_loop)]; + if(parent->empty()) + builder.SetInsertPoint(parent); + else + builder.SetInsertPoint(&*parent->getFirstInsertionPt()); PHINode *ptr = builder.CreatePHI(ptr_ty, 2); + // offset + PHINode *offset = builder.CreatePHI(builder.getInt32Ty(), 2); + Value *next_offset = builder.CreateNeg(offset); + offset->addIncoming(builder.getInt32(alloc_->get_num_bytes(phi) / 2 / 4), pre_block); + offset->addIncoming(next_offset, loop_block); + // next pointer + Value *pre_ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(alloc_->get_offset(phi))); + pre_ptr = builder.CreateBitCast(pre_ptr, ptr->getType()); + Value *next_ptr = builder.CreateGEP(ptr, offset); + ptr->addIncoming(pre_ptr, pre_block); + ptr->addIncoming(next_ptr, loop_block); tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); + tmap_.insert({pre_value, new shared_tile(ty, shapes, pre_ptr, builder)}); + tmap_.insert({loop_value, new shared_tile(ty, shapes, next_ptr, builder)}); } else throw std::runtime_error("unknown shared memory tile"); @@ -633,46 +658,21 @@ void selection::run(ir::module &src, Module &dst){ init_grids(fn, dst_builder, sh_mem_ptr); // iterate through block for(ir::basic_block *block: fn->blocks()) { - dst_builder.SetInsertPoint((BasicBlock*)vmap_[block]); - for(ir::instruction *i: block->get_inst_list()) + BasicBlock *parent = (BasicBlock*)vmap_[block]; + dst_builder.SetInsertPoint(parent); + for(ir::instruction *i: block->get_inst_list()){ + if(dynamic_cast(i)) + dst_builder.SetInsertPoint(&*parent->getFirstInsertionPt()); lower_instruction(i, dst_builder); + if(dynamic_cast(i)) + dst_builder.SetInsertPoint(parent); + } } // add phi operands for(ir::basic_block *block: fn->blocks()) for(ir::instruction *inst: block->get_inst_list()) if(auto *phi = dynamic_cast(inst)){ if(buffer_info_->is_shared(phi)) { - BasicBlock *parent = (BasicBlock*)vmap_[phi->get_parent()]; - unsigned id_pre = 0, id_loop = 1; - if(phi->get_incoming_block(0) == phi->get_parent()) - std::swap(id_pre, id_loop); - ir::value *pre_value = phi->get_incoming_value(id_pre); - ir::value *loop_value = phi->get_incoming_value(id_loop); - BasicBlock *pre_block = (BasicBlock*)vmap_[phi->get_incoming_block(id_pre)]; - BasicBlock *loop_block = (BasicBlock*)vmap_[phi->get_incoming_block(id_loop)]; - int pre_offset = alloc_->get_offset(pre_value); - int loop_offset = alloc_->get_offset(loop_value); - dst_builder.SetInsertPoint(&*parent->getFirstInsertionPt()); - PHINode *ptr = (PHINode*)(((shared_tile*)tmap_.at(phi))->get_pointer()); - // offset - PHINode *offset = dst_builder.CreatePHI(dst_builder.getInt32Ty(), 2); - dst_builder.SetInsertPoint(parent->getFirstNonPHI()); - Value *next_offset = dst_builder.CreateNeg(offset); - offset->addIncoming(dst_builder.getInt32((loop_offset - pre_offset)/4), pre_block); - offset->addIncoming(next_offset, loop_block); - // next pointer - Value *pre_ptr = dst_builder.CreateGEP(sh_mem_ptr, dst_builder.getInt32(pre_offset)); - pre_ptr = dst_builder.CreateBitCast(pre_ptr, ptr->getType()); - Value *next_ptr = dst_builder.CreateGEP(ptr, offset); - ptr->addIncoming(pre_ptr, pre_block); - ptr->addIncoming(next_ptr, loop_block); - // barrier - Function *barrier = Intrinsic::getDeclaration(dst_fn->getParent(), Intrinsic::nvvm_barrier0); - dst_builder.SetInsertPoint(pre_block->getTerminator()); - dst_builder.CreateCall(barrier, {}); - dst_builder.SetInsertPoint(loop_block->getTerminator()); - dst_builder.CreateCall(barrier, {}); - continue; } for(unsigned n = 0; n < phi->get_num_incoming(); n++){ diff --git a/lib/codegen/vectorize.cpp b/lib/codegen/vectorize.cpp new file mode 100644 index 000000000..41a1afd10 --- /dev/null +++ b/lib/codegen/vectorize.cpp @@ -0,0 +1,28 @@ +#include "codegen/vectorize.h" +#include "codegen/tune.h" +#include "ir/module.h" +#include "ir/function.h" +#include "ir/basic_block.h" +#include "ir/instructions.h" + +namespace tdl { + +namespace codegen{ + +void vectorize::run(ir::module &mod) { + ir::builder &builder = mod.get_builder(); + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i: block->get_inst_list()) + if(dynamic_cast(i)){ + builder.set_insert_point(i); + ir::value *x = i->get_operand(0); + ir::instruction *rx = (ir::instruction*)builder.create_vectorize(x); + x->replace_all_uses_with(rx); + rx->set_operand(0, x); + params_->copy(rx, x); + } +} + +} +}