From 49fd6ece995d46fea83a4629c37d39f04c293f4a Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 21 Mar 2019 23:51:47 -0700 Subject: [PATCH] some cleaning --- include/triton/driver/buffer.h | 6 + include/triton/driver/context.h | 5 + include/triton/driver/device.h | 5 + include/triton/driver/dispatch.h | 3 + include/triton/driver/event.h | 4 +- include/triton/driver/handle.h | 12 +- include/triton/driver/kernel.h | 5 + include/triton/driver/module.h | 7 +- include/triton/driver/platform.h | 11 ++ include/triton/driver/stream.h | 13 +- lib/codegen/selection.cpp | 273 ++++++++++++++++++------------- lib/driver/buffer.cpp | 13 +- lib/driver/context.cpp | 13 +- lib/driver/dispatch.cpp | 2 + lib/driver/error.cpp | 61 +++++++ lib/driver/event.cpp | 4 +- lib/driver/kernel.cpp | 21 ++- lib/driver/module.cpp | 107 ++++++++++-- lib/driver/platform.cpp | 14 +- lib/driver/stream.cpp | 26 ++- 20 files changed, 423 insertions(+), 182 deletions(-) diff --git a/include/triton/driver/buffer.h b/include/triton/driver/buffer.h index 08bfede1d..667892525 100755 --- a/include/triton/driver/buffer.h +++ b/include/triton/driver/buffer.h @@ -45,6 +45,12 @@ protected: driver::context* context_; }; +// CPU +class cpu_buffer: public buffer +{ + +}; + // OpenCL class ocl_buffer: public buffer { diff --git a/include/triton/driver/context.h b/include/triton/driver/context.h index 379fe6962..f56e45451 100755 --- a/include/triton/driver/context.h +++ b/include/triton/driver/context.h @@ -48,6 +48,11 @@ protected: std::string cache_path_; }; +// CPU +class cpu_context: public context { + +}; + // CUDA class cu_context: public context { public: diff --git a/include/triton/driver/device.h b/include/triton/driver/device.h index 97071ec27..8a5f7790b 100755 --- a/include/triton/driver/device.h +++ b/include/triton/driver/device.h @@ -40,6 +40,11 @@ public: using polymorphic_resource::polymorphic_resource; }; +// CPU device +class cpu_device: public device { + +}; + // OpenCL device class ocl_device: public device { public: diff --git a/include/triton/driver/dispatch.h b/include/triton/driver/dispatch.h index f579602c4..1e0459931 100755 --- a/include/triton/driver/dispatch.h +++ b/include/triton/driver/dispatch.h @@ -49,6 +49,7 @@ template void check(T){} void check(CUresult err); void check(cublasStatus_t err); void check(cudnnStatus_t err); +void check(cl_int err); class dispatch { @@ -117,6 +118,7 @@ public: static cl_int clGetKernelInfo(cl_kernel, cl_kernel_info, size_t, void *, size_t *); static cl_int clGetKernelWorkGroupInfo(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *); static cl_kernel clCreateKernel(cl_program, const char *, cl_int *); + static cl_int clCreateKernelsInProgram(cl_program, cl_uint, cl_kernel*, cl_uint*); static cl_mem clCreateBuffer(cl_context, cl_mem_flags, size_t, void *, cl_int *); static cl_program clCreateProgramWithSource(cl_context, cl_uint, const char **, const size_t *, cl_int *); static cl_int clReleaseKernel(cl_kernel); @@ -233,6 +235,7 @@ private: static void* clGetKernelInfo_; static void* clGetKernelWorkGroupInfo_; static void* clCreateKernel_; + static void* clCreateKernelsInProgram_; static void* clCreateBuffer_; static void* clCreateProgramWithSource_; static void* clReleaseKernel_; diff --git a/include/triton/driver/event.h b/include/triton/driver/event.h index 3343ba6e3..633f03d7d 100755 --- a/include/triton/driver/event.h +++ b/include/triton/driver/event.h @@ -31,8 +31,8 @@ namespace triton namespace driver { -// Event -class Event +// event +class event { public: float elapsed_time() const; diff --git a/include/triton/driver/handle.h b/include/triton/driver/handle.h index 6de493722..2bbe7ba42 100755 --- a/include/triton/driver/handle.h +++ b/include/triton/driver/handle.h @@ -35,6 +35,12 @@ namespace triton namespace driver { +enum backend_t { + CUDA, + OpenCL +}; + +// helpers for CUDA struct cu_event_t{ operator bool() const { return first && second; } CUevent first; @@ -79,18 +85,20 @@ protected: template class polymorphic_resource { public: - polymorphic_resource(CUType cu, bool take_ownership): cu_(cu, take_ownership){} - polymorphic_resource(CLType cl, bool take_ownership): cl_(cl, take_ownership){} + polymorphic_resource(CUType cu, bool take_ownership): cu_(cu, take_ownership), backend_(CUDA){} + polymorphic_resource(CLType cl, bool take_ownership): cl_(cl, take_ownership), backend_(OpenCL){} virtual ~polymorphic_resource() { } handle cu() { return cu_; } handle cl() { return cl_; } const handle& cu() const { return cu_; } const handle& cl() const { return cl_; } + backend_t backend() { return backend_; } protected: handle cl_; handle cu_; + backend_t backend_; }; } diff --git a/include/triton/driver/kernel.h b/include/triton/driver/kernel.h index 0657e775f..a68b81840 100755 --- a/include/triton/driver/kernel.h +++ b/include/triton/driver/kernel.h @@ -53,6 +53,11 @@ private: driver::module* program_; }; +// CPU +class cpu_kernel: public kernel { + +}; + // OpenCL class ocl_kernel: public kernel { public: diff --git a/include/triton/driver/module.h b/include/triton/driver/module.h index 92a237d3d..a149812cf 100755 --- a/include/triton/driver/module.h +++ b/include/triton/driver/module.h @@ -56,12 +56,17 @@ public: driver::context* context() const; void compile_llvm_module(llvm::Module* module, const std::string& triple, const std::string &proc, std::string layout, - llvm::SmallVectorImpl &buffer); + llvm::SmallVectorImpl &buffer, std::vector files = {}); protected: driver::context* ctx_; }; +// CPU +class cpu_module: public module{ + +}; + // OpenCL class ocl_module: public module{ diff --git a/include/triton/driver/platform.h b/include/triton/driver/platform.h index 9ea1e9f3b..ff06e4b01 100755 --- a/include/triton/driver/platform.h +++ b/include/triton/driver/platform.h @@ -50,6 +50,7 @@ private: std::string name_; }; +// CUDA class cu_platform: public platform { public: @@ -61,6 +62,7 @@ private: handle cu_; }; +// OpenCL class cl_platform: public platform { public: @@ -72,6 +74,15 @@ private: handle cl_; }; +// CPU +class cpu_platform: public platform +{ +public: + cpu_platform(): platform("CPU") { } + std::string version() const; + void devices(std::vector &devices) const; +}; + } } diff --git a/include/triton/driver/stream.h b/include/triton/driver/stream.h index 18bedbce0..6044a0376 100755 --- a/include/triton/driver/stream.h +++ b/include/triton/driver/stream.h @@ -36,7 +36,7 @@ namespace driver { class kernel; -class Event; +class event; class Range; class cu_buffer; @@ -51,7 +51,7 @@ public: driver::context* context() const; // methods virtual void synchronize() = 0; - virtual void enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const * = NULL, Event *event = NULL) = 0; + virtual void enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const * = NULL, event *event = NULL) = 0; virtual void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr) = 0; virtual void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr) = 0; // template helpers @@ -64,6 +64,11 @@ protected: driver::context *ctx_; }; +// CPU +class cpu_stream: public stream { + +}; + // OpenCL class cl_stream: public stream { public: @@ -72,7 +77,7 @@ public: // Overridden void synchronize(); - void enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, Event *event); + void enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, event *event); void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr); void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr); }; @@ -86,7 +91,7 @@ public: // Overridden void synchronize(); - void enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, Event *event); + void enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, event *event); void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr); void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr); }; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 7523665a8..f8667b5f7 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -19,6 +19,59 @@ namespace codegen{ using namespace llvm; +inline void set_kernel(llvm::IRBuilder<>& builder, llvm::LLVMContext &ctx, llvm::Module *module, llvm::Function* fn) { + fn->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); +// module->getOrInsertNamedMetadata("opencl.ocl.version")->addOperand(llvm::MDTuple::get(ctx, {llvm::ValueAsMetadata::get(builder.getInt32(2)), llvm::ValueAsMetadata::get(builder.getInt32(0))})); + + // // set metadata + // llvm::Metadata *md_args[] = { + // llvm::ValueAsMetadata::get(dst_fn), + // llvm::MDString::get(dst_ctx, "kernel"), + // llvm::ValueAsMetadata::get(dst_builder.getInt32(1)) + // }; + // module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(llvm::MDNode::get(dst_ctx, md_args)); +} + +inline Instruction* add_barrier(llvm::Module *module, llvm::IRBuilder<>& builder) { +// Function *barrier = Intrinsic::getDeclaration(module, Intrinsic::nvvm_barrier0); +// return builder.CreateCall(barrier, {}); + + Function *barrier = Intrinsic::getDeclaration(module, Intrinsic::amdgcn_s_barrier); + return builder.CreateCall(barrier, {}); +} + +inline Value* get_global_offset(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned stride, unsigned ax) { +// static std::array ctaid = { +// Intrinsic::nvvm_read_ptx_sreg_ctaid_x, +// Intrinsic::nvvm_read_ptx_sreg_ctaid_y, +// Intrinsic::nvvm_read_ptx_sreg_ctaid_z +// }; + static std::array ids = { + Intrinsic::amdgcn_workgroup_id_x, + Intrinsic::amdgcn_workgroup_id_y, + Intrinsic::amdgcn_workgroup_id_z + }; + Value* get_group_id = Intrinsic::getDeclaration(module, ids[ax]); + Value* group_id = builder.CreateCall(get_group_id, {}); + Value* result = builder.CreateMul(builder.getInt32(stride), group_id); + return result; +} + +inline Value* get_local_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax) { +// static std::array ids = { +// Intrinsic::nvvm_read_ptx_sreg_tid_x, +// Intrinsic::nvvm_read_ptx_sreg_tid_y, +// Intrinsic::nvvm_read_ptx_sreg_tid_z +// }; + static std::array ids = { + Intrinsic::amdgcn_workitem_id_x, + Intrinsic::amdgcn_workitem_id_y, + Intrinsic::amdgcn_workitem_id_z + }; + Function *get_local_id = Intrinsic::getDeclaration(module, ids[ax]); + return builder.CreateCall(get_local_id, {}); +} + /* Distributed Tile */ void distributed_tile::init_indices() { std::vector id(axes_.size(), 0); @@ -264,8 +317,7 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::function(inst)){ Module *module = builder.GetInsertBlock()->getModule(); - Function *barrier = Intrinsic::getDeclaration(module, Intrinsic::nvvm_barrier0); - return builder.CreateCall(barrier, {}); + return add_barrier(module, builder); } if(auto* ii = dynamic_cast(inst)){ Type *ty = type(ii->get_type()->get_scalar_ty()); @@ -561,9 +613,8 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, void selection::init_grids(ir::function *fn, IRBuilder<> &builder, Value *sh_mem_ptr){ // fetch linear ID Module *mod = builder.GetInsertBlock()->getParent()->getParent(); - Function *get_thread_id = Intrinsic::getDeclaration(mod, Intrinsic::nvvm_read_ptx_sreg_tid_x); Value *warp_size = builder.getInt32(32); - Value *u_thread_id = builder.CreateCall(get_thread_id, {}); + Value* u_thread_id = get_local_id(mod, builder, 0); Value *u_thread_warp_id = builder.CreateURem(u_thread_id, warp_size); Value *u_warp_id = builder.CreateUDiv(u_thread_id, warp_size); // create grid @@ -619,14 +670,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & const auto& shapes = ins->get_type()->get_tile_shapes(); // global_range if(auto *x = dynamic_cast(ins)) { - static std::array ctaid = { - Intrinsic::nvvm_read_ptx_sreg_ctaid_x, - Intrinsic::nvvm_read_ptx_sreg_ctaid_y, - Intrinsic::nvvm_read_ptx_sreg_ctaid_z - }; - Function *get_group_id = Intrinsic::getDeclaration(module, ctaid[x->get_axis()]); - Value *group_id = builder.CreateCall(get_group_id, {}); - Value *offset = builder.CreateMul(builder.getInt32(shapes[0]->get_value()), group_id); + Value *offset = get_global_offset(module, builder, shapes[0]->get_value(), x->get_axis()); result->for_each([&](indices_t idx){ BinaryOperator *bin = static_cast(idx[0]); result->set_value(idx, builder.CreateAdd(bin, offset)); @@ -739,27 +783,27 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & return; // matrix multiplication else if(dynamic_cast(ins)) { - ir::value *A = ins->get_operand(0); - ir::value *B = ins->get_operand(1); - ir::value *C = ins->get_operand(2); - shared_tile *TA = (shared_tile*)tmap_.at(A); - shared_tile *TB = (shared_tile*)tmap_.at(B); - distributed_tile *TC = (distributed_tile*)tmap_.at(C); - TA->set_vector_size(TC->axis(0).contiguous); - TB->set_vector_size(TC->axis(1).contiguous); - Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {llvm_type(C->get_type()->get_scalar_ty(), ctx)}); - result->for_each([&](indices_t idx){ - Value *res = TC->get_value(idx); - unsigned NK = A->get_type()->get_tile_shapes()[1]->get_value(); - for(unsigned K = 0; K < NK; ++K){ - indices_t a_idx = {idx[0], builder.getInt32(K)}; - indices_t b_idx = {idx[1], builder.getInt32(K)}; - Value *a = TA->get_value(a_idx); - Value *b = TB->get_value(b_idx); - res = builder.CreateCall(f_mul_add, {a, b, res}); - } - result->set_value(idx, res); - }); +// ir::value *A = ins->get_operand(0); +// ir::value *B = ins->get_operand(1); +// ir::value *C = ins->get_operand(2); +// shared_tile *TA = (shared_tile*)tmap_.at(A); +// shared_tile *TB = (shared_tile*)tmap_.at(B); +// distributed_tile *TC = (distributed_tile*)tmap_.at(C); +// TA->set_vector_size(TC->axis(0).contiguous); +// TB->set_vector_size(TC->axis(1).contiguous); +// Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {llvm_type(C->get_type()->get_scalar_ty(), ctx)}); +// result->for_each([&](indices_t idx){ +// Value *res = TC->get_value(idx); +// unsigned NK = A->get_type()->get_tile_shapes()[1]->get_value(); +// for(unsigned K = 0; K < NK; ++K){ +// indices_t a_idx = {idx[0], builder.getInt32(K)}; +// indices_t b_idx = {idx[1], builder.getInt32(K)}; +// Value *a = TA->get_value(a_idx); +// Value *b = TB->get_value(b_idx); +// res = builder.CreateCall(f_mul_add, {a, b, res}); +// } +// result->set_value(idx, res); +// }); } // element-wise else { @@ -805,7 +849,7 @@ ArrayType* selection::llvm_linearized_tile_type(ir::type *ty, LLVMContext &ctx) return ArrayType::get(llvm_type(ty->get_scalar_ty(), ctx), size); } -void selection::run(ir::module &src, Module &dst){ +void selection::run(ir::module &src, Module &dst) { vmap_.clear(); LLVMContext &dst_ctx = dst.getContext(); IRBuilder<> dst_builder(dst_ctx); @@ -825,13 +869,7 @@ void selection::run(ir::module &src, Module &dst){ for(ir::attribute_t attr: attr_pair.second) dst_fn->addAttribute(id, llvm_attr(attr)); } - // set metadata - llvm::Metadata *md_args[] = { - llvm::ValueAsMetadata::get(dst_fn), - llvm::MDString::get(dst_ctx, "kernel"), - llvm::ValueAsMetadata::get(dst_builder.getInt32(1)) - }; - dst.getOrInsertNamedMetadata("nvvm.annotations")->addOperand(llvm::MDNode::get(dst_ctx, md_args)); + set_kernel(dst_builder, dst_ctx, &dst, dst_fn); // map parameters for(unsigned i = 0; i < fn->args().size(); i++) @@ -842,82 +880,83 @@ void selection::run(ir::module &src, Module &dst){ vmap_[block] = dst_block; } dst_builder.SetInsertPoint((BasicBlock*)vmap_[fn->blocks()[0]]); - // allocate shared memory - Value *sh_mem_ptr = nullptr; - if(unsigned alloc_size = alloc_->get_allocated_size()){ - Type *int_8_ty = Type::getInt8Ty(dst_ctx); - ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); - Type *ptr_ty = PointerType::get(int_8_ty, 3); - GlobalVariable *sh_mem_array = - new GlobalVariable(dst, array_ty, false, GlobalVariable::ExternalLinkage, - nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); - sh_mem_ptr = dst_builder.CreateBitCast(sh_mem_array, ptr_ty); - } - // create grids - init_grids(fn, dst_builder, sh_mem_ptr); - std::map last_block; - // iterate through block - for(ir::basic_block *block: fn->blocks()) { - BasicBlock *parent = (BasicBlock*)vmap_[block]; - dst_builder.SetInsertPoint(parent); - for(ir::instruction *i: block->get_inst_list()){ - BasicBlock *current = dst_builder.GetInsertBlock(); - bool phi_inserted = (dynamic_cast(i) || dynamic_cast(i)) && !current->empty(); - if(phi_inserted) - dst_builder.SetInsertPoint(&*current->getFirstInsertionPt()); - lower_instruction(i, dst_builder); - if(phi_inserted) - dst_builder.SetInsertPoint(current); - last_block[block] = dst_builder.GetInsertBlock(); - } - } - // add phi operands - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *inst: block->get_inst_list()) - if(auto *phi = dynamic_cast(inst)){ - if(buffer_info_->is_double(phi)) { - PHINode *ptr = (PHINode*)((shared_tile*)tmap_.at(phi))->get_pointer(); - PHINode *offset = (PHINode*)((shared_tile*)tmap_.at(phi))->get_offset(); - for(unsigned n = 0; n < phi->get_num_incoming(); n++){ - ir::basic_block* inc_block = phi->get_incoming_block(n); - ir::value* inc_val = phi->get_incoming_value(n); - ir::value* terminator = inc_block->get_inst_list().back(); - BasicBlock *llvm_inc_block = last_block.at(inc_block); - shared_tile *inc_shared = (shared_tile*)tmap_.at(inc_val); - bool is_loop_latch = buffer_info_->is_loop_latch(phi, terminator); - if(is_loop_latch){ - dst_builder.SetInsertPoint(llvm_inc_block->getTerminator()); - Value *next_offset = dst_builder.CreateNeg(offset); - offset->addIncoming(next_offset, llvm_inc_block); - } - else { - offset->addIncoming(dst_builder.getInt32(alloc_->get_num_bytes(phi)/(2*4)), llvm_inc_block); - } - ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block); - } - } - else { - for(unsigned n = 0; n < phi->get_num_incoming(); n++){ - ir::value *inc_val = phi->get_incoming_value(n); - ir::basic_block *inc_block = phi->get_incoming_block(n); - BasicBlock *llvm_inc_block = last_block.at(inc_block); - if(phi->get_type()->is_tile_ty()) { - distributed_tile *phi_tile = (distributed_tile*)tmap_.at(phi); - distributed_tile *inc_tile = (distributed_tile*)tmap_.at(inc_val); - phi_tile->for_each([&](indices_t idx){ - PHINode *llvm_phi = (PHINode*)phi_tile->get_value(idx); - Value *llvm_inc_val = inc_tile->get_value(idx); - llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); - }); - } - else { - PHINode *llvm_phi = (PHINode*)vmap_.at(phi); - Value *llvm_inc_val = vmap_.at(inc_val); - llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); - } - } - } - } + dst_builder.CreateRetVoid(); +// // allocate shared memory +// Value *sh_mem_ptr = nullptr; +// if(unsigned alloc_size = alloc_->get_allocated_size()){ +// Type *int_8_ty = Type::getInt8Ty(dst_ctx); +// ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); +// Type *ptr_ty = PointerType::get(int_8_ty, 3); +// GlobalVariable *sh_mem_array = +// new GlobalVariable(dst, array_ty, false, GlobalVariable::ExternalLinkage, +// nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); +// sh_mem_ptr = dst_builder.CreateBitCast(sh_mem_array, ptr_ty); +// } +// // create grids +// init_grids(fn, dst_builder, sh_mem_ptr); +// std::map last_block; +// // iterate through block +// for(ir::basic_block *block: fn->blocks()) { +// BasicBlock *parent = (BasicBlock*)vmap_[block]; +// dst_builder.SetInsertPoint(parent); +// for(ir::instruction *i: block->get_inst_list()){ +// BasicBlock *current = dst_builder.GetInsertBlock(); +// bool phi_inserted = (dynamic_cast(i) || dynamic_cast(i)) && !current->empty(); +// if(phi_inserted) +// dst_builder.SetInsertPoint(&*current->getFirstInsertionPt()); +// lower_instruction(i, dst_builder); +// if(phi_inserted) +// dst_builder.SetInsertPoint(current); +// last_block[block] = dst_builder.GetInsertBlock(); +// } +// } +// // add phi operands +// for(ir::basic_block *block: fn->blocks()) +// for(ir::instruction *inst: block->get_inst_list()) +// if(auto *phi = dynamic_cast(inst)){ +// if(buffer_info_->is_double(phi)) { +// PHINode *ptr = (PHINode*)((shared_tile*)tmap_.at(phi))->get_pointer(); +// PHINode *offset = (PHINode*)((shared_tile*)tmap_.at(phi))->get_offset(); +// for(unsigned n = 0; n < phi->get_num_incoming(); n++){ +// ir::basic_block* inc_block = phi->get_incoming_block(n); +// ir::value* inc_val = phi->get_incoming_value(n); +// ir::value* terminator = inc_block->get_inst_list().back(); +// BasicBlock *llvm_inc_block = last_block.at(inc_block); +// shared_tile *inc_shared = (shared_tile*)tmap_.at(inc_val); +// bool is_loop_latch = buffer_info_->is_loop_latch(phi, terminator); +// if(is_loop_latch){ +// dst_builder.SetInsertPoint(llvm_inc_block->getTerminator()); +// Value *next_offset = dst_builder.CreateNeg(offset); +// offset->addIncoming(next_offset, llvm_inc_block); +// } +// else { +// offset->addIncoming(dst_builder.getInt32(alloc_->get_num_bytes(phi)/(2*4)), llvm_inc_block); +// } +// ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block); +// } +// } +// else { +// for(unsigned n = 0; n < phi->get_num_incoming(); n++){ +// ir::value *inc_val = phi->get_incoming_value(n); +// ir::basic_block *inc_block = phi->get_incoming_block(n); +// BasicBlock *llvm_inc_block = last_block.at(inc_block); +// if(phi->get_type()->is_tile_ty()) { +// distributed_tile *phi_tile = (distributed_tile*)tmap_.at(phi); +// distributed_tile *inc_tile = (distributed_tile*)tmap_.at(inc_val); +// phi_tile->for_each([&](indices_t idx){ +// PHINode *llvm_phi = (PHINode*)phi_tile->get_value(idx); +// Value *llvm_inc_val = inc_tile->get_value(idx); +// llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); +// }); +// } +// else { +// PHINode *llvm_phi = (PHINode*)vmap_.at(phi); +// Value *llvm_inc_val = vmap_.at(inc_val); +// llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); +// } +// } +// } +// } } } diff --git a/lib/driver/buffer.cpp b/lib/driver/buffer.cpp index 433d33b2e..6bec0c66e 100755 --- a/lib/driver/buffer.cpp +++ b/lib/driver/buffer.cpp @@ -47,11 +47,11 @@ driver::context* buffer::context() { } buffer* buffer::create(driver::context* ctx, size_t size) { - if(dynamic_cast(ctx)) - return new cu_buffer(ctx, size); - if(dynamic_cast(ctx)) - return new ocl_buffer(ctx, size); - throw std::runtime_error("unknown context"); + switch(ctx->backend()){ + case CUDA: return new cu_buffer(ctx, size); + case OpenCL: return new ocl_buffer(ctx, size); + default: throw std::runtime_error("unknown backend"); + } } // @@ -59,7 +59,8 @@ buffer* buffer::create(driver::context* ctx, size_t size) { ocl_buffer::ocl_buffer(driver::context* context, size_t size) : buffer(context, cl_mem(), true){ cl_int err; - dispatch::clCreateBuffer(*context->cl(), CL_MEM_READ_WRITE, size, NULL, &err); + *cl_ = dispatch::clCreateBuffer(*context->cl(), CL_MEM_READ_WRITE, size, NULL, &err); + check(err); } diff --git a/lib/driver/context.cpp b/lib/driver/context.cpp index 6e1618713..5b82492b9 100755 --- a/lib/driver/context.cpp +++ b/lib/driver/context.cpp @@ -51,11 +51,11 @@ context::context(driver::device *dev, cl_context cl, bool take_ownership): } context* context::create(driver::device *dev){ - if(dynamic_cast(dev)) - return new cu_context(dev); - if(dynamic_cast(dev)) - return new ocl_context(dev); - throw std::runtime_error("unknown context"); + switch(dev->backend()){ + case CUDA: return new cu_context(dev); + case OpenCL: return new ocl_context(dev); + default: throw std::runtime_error("unknown backend"); + } } @@ -99,7 +99,7 @@ cu_context::context_switcher::context_switcher(const context &ctx): ctx_((const cu_context::context_switcher::~context_switcher() { CUcontext tmp; dispatch::cuCtxPopCurrent_v2(&tmp); - assert(tmp==(CUcontext)ctx_ && "Switching back to invalid context!"); + assert(tmp==*ctx_.cu() && "Switching back to invalid context!"); } // import CUdevice @@ -129,6 +129,7 @@ cu_context::cu_context(driver::device* device): context(device, CUcontext(), tru ocl_context::ocl_context(driver::device* dev): context(dev, cl_context(), true) { cl_int err; *cl_ = dispatch::clCreateContext(nullptr, 1, &*dev->cl(), nullptr, nullptr, &err); + check(err); } diff --git a/lib/driver/dispatch.cpp b/lib/driver/dispatch.cpp index b2e556d8e..4dfd6df6e 100755 --- a/lib/driver/dispatch.cpp +++ b/lib/driver/dispatch.cpp @@ -286,6 +286,7 @@ OCL_DEFINE5(cl_int, clGetProgramInfo, cl_program, cl_program_info, size_t, void OCL_DEFINE5(cl_int, clGetKernelInfo, cl_kernel, cl_kernel_info, size_t, void *, size_t *) OCL_DEFINE6(cl_int, clGetKernelWorkGroupInfo, cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *) OCL_DEFINE3(cl_kernel, clCreateKernel, cl_program, const char *, cl_int *) +OCL_DEFINE4(cl_int, clCreateKernelsInProgram, cl_program, cl_uint, cl_kernel*, cl_uint*) OCL_DEFINE5(cl_mem, clCreateBuffer, cl_context, cl_mem_flags, size_t, void *, cl_int *) OCL_DEFINE5(cl_program, clCreateProgramWithSource, cl_context, cl_uint, const char **, const size_t *, cl_int *) OCL_DEFINE1(cl_int, clReleaseKernel, cl_kernel) @@ -343,6 +344,7 @@ void* dispatch::clGetProgramInfo_; void* dispatch::clGetKernelInfo_; void* dispatch::clGetKernelWorkGroupInfo_; void* dispatch::clCreateKernel_; +void* dispatch::clCreateKernelsInProgram_; void* dispatch::clCreateBuffer_; void* dispatch::clCreateProgramWithSource_; void* dispatch::clReleaseKernel_; diff --git a/lib/driver/error.cpp b/lib/driver/error.cpp index f3cce16d5..99b2401dd 100755 --- a/lib/driver/error.cpp +++ b/lib/driver/error.cpp @@ -133,6 +133,67 @@ void check(cudnnStatus_t err){ } } +void check(cl_int err) +{ + using namespace exception::ocl; + switch(err) + { + case CL_SUCCESS: break; + case CL_DEVICE_NOT_FOUND: throw device_not_found(); + case CL_DEVICE_NOT_AVAILABLE: throw device_not_available(); + case CL_COMPILER_NOT_AVAILABLE: throw compiler_not_available(); + case CL_MEM_OBJECT_ALLOCATION_FAILURE: throw mem_object_allocation_failure(); + case CL_OUT_OF_RESOURCES: throw out_of_resources(); + case CL_OUT_OF_HOST_MEMORY: throw out_of_host_memory(); + case CL_PROFILING_INFO_NOT_AVAILABLE: throw profiling_info_not_available(); + case CL_MEM_COPY_OVERLAP: throw mem_copy_overlap(); + case CL_IMAGE_FORMAT_MISMATCH: throw image_format_mismatch(); + case CL_IMAGE_FORMAT_NOT_SUPPORTED: throw image_format_not_supported(); + case CL_BUILD_PROGRAM_FAILURE: throw build_program_failure(); + case CL_MAP_FAILURE: throw map_failure(); + + case CL_INVALID_VALUE: throw invalid_value(); + case CL_INVALID_DEVICE_TYPE: throw invalid_device_type(); + case CL_INVALID_PLATFORM: throw invalid_platform(); + case CL_INVALID_DEVICE: throw invalid_device(); + case CL_INVALID_CONTEXT: throw invalid_context(); + case CL_INVALID_QUEUE_PROPERTIES: throw invalid_queue_properties(); + case CL_INVALID_COMMAND_QUEUE: throw invalid_command_queue(); + case CL_INVALID_HOST_PTR: throw invalid_host_ptr(); + case CL_INVALID_MEM_OBJECT: throw invalid_mem_object(); + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: throw invalid_image_format_descriptor(); + case CL_INVALID_IMAGE_SIZE: throw invalid_image_size(); + case CL_INVALID_SAMPLER: throw invalid_sampler(); + case CL_INVALID_BINARY: throw invalid_binary(); + case CL_INVALID_BUILD_OPTIONS: throw invalid_build_options(); + case CL_INVALID_PROGRAM: throw invalid_program(); + case CL_INVALID_PROGRAM_EXECUTABLE: throw invalid_program_executable(); + case CL_INVALID_KERNEL_NAME: throw invalid_kernel_name(); + case CL_INVALID_KERNEL_DEFINITION: throw invalid_kernel_definition(); + case CL_INVALID_KERNEL: throw invalid_kernel(); + case CL_INVALID_ARG_INDEX: throw invalid_arg_index(); + case CL_INVALID_ARG_VALUE: throw invalid_arg_value(); + case CL_INVALID_ARG_SIZE: throw invalid_arg_size(); + case CL_INVALID_KERNEL_ARGS: throw invalid_kernel_args(); + case CL_INVALID_WORK_DIMENSION: throw invalid_work_dimension(); + case CL_INVALID_WORK_GROUP_SIZE: throw invalid_work_group_size(); + case CL_INVALID_WORK_ITEM_SIZE: throw invalid_work_item_size(); + case CL_INVALID_GLOBAL_OFFSET: throw invalid_global_offset(); + case CL_INVALID_EVENT_WAIT_LIST: throw invalid_event_wait_list(); + case CL_INVALID_EVENT: throw invalid_event(); + case CL_INVALID_OPERATION: throw invalid_operation(); + case CL_INVALID_GL_OBJECT: throw invalid_gl_object(); + case CL_INVALID_BUFFER_SIZE: throw invalid_buffer_size(); + case CL_INVALID_MIP_LEVEL: throw invalid_mip_level(); + case CL_INVALID_GLOBAL_WORK_SIZE: throw invalid_global_work_size(); + #ifdef CL_INVALID_PROPERTY + case CL_INVALID_PROPERTY: throw invalid_property(); + #endif + default: throw; + } +} + + } } diff --git a/lib/driver/event.cpp b/lib/driver/event.cpp index ddd5f3874..ad341d701 100755 --- a/lib/driver/event.cpp +++ b/lib/driver/event.cpp @@ -27,13 +27,13 @@ namespace triton namespace driver { -float Event::elapsed_time() const{ +float event::elapsed_time() const{ float time; dispatch::cuEventElapsedTime(&time, cu_->first, cu_->second); return time; } -handle const & Event::cu() const +handle const & event::cu() const { return cu_; } } diff --git a/lib/driver/kernel.cpp b/lib/driver/kernel.cpp index 1490ad21d..4e40e6196 100755 --- a/lib/driver/kernel.cpp +++ b/lib/driver/kernel.cpp @@ -46,11 +46,11 @@ kernel::kernel(driver::module *program, cl_kernel fn, bool has_ownership): } kernel* kernel::create(driver::module* program, const char* name) { - if(dynamic_cast(program)) - return new cu_kernel(program, name); - if(dynamic_cast(program)) - return new ocl_kernel(program, name); - throw std::runtime_error("unknown program"); + switch(program->backend()){ + case CUDA: return new cu_kernel(program, name); + case OpenCL: return new ocl_kernel(program, name); + default: throw std::runtime_error("unknown backend"); + } } driver::module* kernel::module() { @@ -62,16 +62,21 @@ driver::module* kernel::module() { /* ------------------------ */ ocl_kernel::ocl_kernel(driver::module* program, const char* name): kernel(program, cl_kernel(), true) { +// cl_uint res; +// check(dispatch::clCreateKernelsInProgram(*program->cl(), 0, NULL, &res)); +// std::cout << res << std::endl; cl_int err; - *cl_ = dispatch::clCreateKernel(*program->cl(), name, &err); + std::cout << *program->cl() << std::endl; + *cl_ = dispatch::clCreateKernel(*program->cl(), "matmul", &err); + check(err); } void ocl_kernel::setArg(unsigned int index, std::size_t size, void* ptr) { - dispatch::clSetKernelArg(*cl_, index, size, ptr); + check(dispatch::clSetKernelArg(*cl_, index, size, ptr)); } void ocl_kernel::setArg(unsigned int index, driver::buffer* buffer) { - dispatch::clSetKernelArg(*cl_, index, sizeof(cl_mem), (void*)&*buffer->cl()); + check(dispatch::clSetKernelArg(*cl_, index, sizeof(cl_mem), (void*)&*buffer->cl())); } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 5796cc7e5..989337b23 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -28,9 +28,15 @@ #include "triton/driver/error.h" #include "triton/tools/sys/getenv.hpp" #include "llvm/IR/IRPrintingPasses.h" +#include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/IR/Verifier.h" #include "llvm/IR/Module.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/PassManager.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Linker/Linker.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/AsmParser/Parser.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" @@ -75,38 +81,60 @@ driver::context* module::context() const { } module* module::create(driver::context* ctx, llvm::Module *src) { - if(dynamic_cast(ctx)) - return new cu_module(ctx, src); - if(dynamic_cast(ctx)) - return new ocl_module(ctx, src); - throw std::runtime_error("unknown context"); + switch(ctx->backend()){ + case CUDA: return new cu_module(ctx, src); + case OpenCL: return new ocl_module(ctx, src); + default: throw std::runtime_error("unknown backend"); + } } void module::compile_llvm_module(llvm::Module* module, const std::string& triple, const std::string &proc, std::string layout, - llvm::SmallVectorImpl &buffer) { + llvm::SmallVectorImpl &buffer, + std::vector files) { init_llvm(); // create machine module->setTargetTriple(triple); std::string error; auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error); - llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, "", - llvm::TargetOptions(), llvm::Reloc::Model(), - llvm::None, llvm::CodeGenOpt::Aggressive); - + llvm::TargetOptions opt; + opt.AllowFPOpFusion = llvm::FPOpFusion::Fast; + opt.UnsafeFPMath = false; + opt.NoInfsFPMath = false; + opt.NoNaNsFPMath = true; + llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, "", opt, + llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive); // set data layout if(layout.empty()) - layout = module->getDataLayoutStr(); - module->setDataLayout(layout); + module->setDataLayout(machine->createDataLayout()); + else + module->setDataLayout(layout); + + // link + for (std::string& file: files) { + std::string path = "/opt/rocm/lib/" + file; + llvm::SMDiagnostic err; + std::unique_ptr mlib = llvm::parseIRFile(path, err, module->getContext()); + if (mlib.get() == nullptr) { + std::string msg = err.getMessage(); + std::cerr << "Fail to load bitcode file " << path << "\n" + << "line " << err.getLineNo() << ":" << msg; + } + mlib->setTargetTriple(module->getTargetTriple()); + mlib->setDataLayout(module->getDataLayout()); + for (llvm::Function &f : mlib->functions()) { + f.addFnAttr(llvm::Attribute::AlwaysInline); + } + llvm::Linker::linkModules(*module, std::move(mlib)); + } - std::cout << "compiling" << std::endl; // emit machine code llvm::legacy::PassManager pass; llvm::raw_svector_ostream stream(buffer); - machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_AssemblyFile); + machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_ObjectFile); pass.run(*module); - std::cout << "compiled" << std::endl; +// std::cout << std::string(buffer.begin(), buffer.end()) << std::endl; } /* ------------------------ */ @@ -114,9 +142,56 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple /* ------------------------ */ ocl_module::ocl_module(driver::context * context, llvm::Module* src): module(context, cl_program(), true) { +// const char* x = "__kernel void matmul(){ }"; +// cl_int err; +// *cl_ = dispatch::clCreateProgramWithSource(*context->cl(), 1, &x, NULL, &err); +// check(err); +// return; + init_llvm(); llvm::SmallVector buffer; - module::compile_llvm_module(src, "amdgcn-amd-amdpal", "gfx902", "", buffer); + std::vector files = { + "oclc_daz_opt_on.amdgcn.bc", + "ocml.amdgcn.bc", + "hc.amdgcn.bc", + "ockl.amdgcn.bc", + "oclc_correctly_rounded_sqrt_off.amdgcn.bc", + "oclc_correctly_rounded_sqrt_on.amdgcn.bc", + "oclc_daz_opt_off.amdgcn.bc", + "oclc_finite_only_off.amdgcn.bc", + "oclc_finite_only_on.amdgcn.bc", + "oclc_isa_version_803.amdgcn.bc", + "oclc_isa_version_900.amdgcn.bc", + "oclc_unsafe_math_off.amdgcn.bc", + "oclc_unsafe_math_on.amdgcn.bc", + "oclc_isa_version_700.amdgcn.bc", + "opencl.amdgcn.bc" + }; + module::compile_llvm_module(src, "amdgcn-amd-amdpal", "gfx902", "", buffer, files); + + + +// llvm::BitcodeWriter writer(buffer); +// writer.writeModule(*src); +// llvm::legacy::PassManager pass; +// llvm::raw_svector_ostream stream(buffer); +// pass.add(llvm::createPrintModulePass(stream)); +// pass.run(*src); + size_t sizes[] = {buffer.size()}; + const unsigned char* data[] = {(unsigned char*)buffer.data()}; + cl_int status; + cl_int err; + *cl_ = dispatch::clCreateProgramWithBinary(*context->cl(), 1, &*context->device()->cl(), sizes, data, &status, &err); + check(err); + check(status); + try{ + dispatch::clBuildProgram(*cl_, 1, &*context->device()->cl(), NULL, NULL, NULL); + } + catch(...){ + char log[2048]; + dispatch::clGetProgramBuildInfo(*cl_, *context->device()->cl(), CL_PROGRAM_BUILD_LOG, 1024, log, NULL); + std::cout << log << std::endl; + } } diff --git a/lib/driver/platform.cpp b/lib/driver/platform.cpp index 60d2bd128..ad85773dd 100755 --- a/lib/driver/platform.cpp +++ b/lib/driver/platform.cpp @@ -58,20 +58,26 @@ void cu_platform::devices(std::vector &devices) const{ std::string cl_platform::version() const { size_t size; - dispatch::clGetPlatformInfo(*cl_, CL_PLATFORM_VERSION, 0, nullptr, &size); + check(dispatch::clGetPlatformInfo(*cl_, CL_PLATFORM_VERSION, 0, nullptr, &size)); std::string result(size, 0); - dispatch::clGetPlatformInfo(*cl_, CL_PLATFORM_VERSION, size, (void*)&*result.begin(), nullptr); + check(dispatch::clGetPlatformInfo(*cl_, CL_PLATFORM_VERSION, size, (void*)&*result.begin(), nullptr)); return result; } void cl_platform::devices(std::vector &devices) const{ cl_uint num_devices; - dispatch::clGetDeviceIDs(*cl_, CL_DEVICE_TYPE_GPU, 0, nullptr, &num_devices); + check(dispatch::clGetDeviceIDs(*cl_, CL_DEVICE_TYPE_GPU, 0, nullptr, &num_devices)); std::vector ids(num_devices); - dispatch::clGetDeviceIDs(*cl_, CL_DEVICE_TYPE_GPU, num_devices, ids.data(), nullptr); + check(dispatch::clGetDeviceIDs(*cl_, CL_DEVICE_TYPE_GPU, num_devices, ids.data(), nullptr)); for(cl_device_id id: ids) devices.push_back(new driver::ocl_device(id)); } +/* ------------------------ */ +// Vulkan // +/* ------------------------ */ + + + } } diff --git a/lib/driver/stream.cpp b/lib/driver/stream.cpp index 35e369716..c3551c832 100755 --- a/lib/driver/stream.cpp +++ b/lib/driver/stream.cpp @@ -44,23 +44,20 @@ namespace driver stream::stream(driver::context *ctx, CUstream cu, bool has_ownership) : polymorphic_resource(cu, has_ownership), ctx_(ctx) { - } stream::stream(driver::context *ctx, cl_command_queue cl, bool has_ownership) : polymorphic_resource(cl, has_ownership), ctx_(ctx) { - } driver::stream* stream::create(driver::context* ctx) { - if(dynamic_cast(ctx)) - return new cu_stream(ctx); - if(dynamic_cast(ctx)) - return new cl_stream(ctx); - throw std::runtime_error("unknown context"); + switch(ctx->backend()){ + case CUDA: return new cu_stream(ctx); + case OpenCL: return new cl_stream(ctx); + default: throw std::runtime_error("unknown backend"); + } } - driver::context* stream::context() const { return ctx_; } @@ -73,22 +70,23 @@ driver::context* stream::context() const { cl_stream::cl_stream(driver::context *ctx): stream(ctx, cl_command_queue(), true) { cl_int err; *cl_ = dispatch::clCreateCommandQueue(*ctx->cl(), *ctx->device()->cl(), 0, &err); + check(err); } void cl_stream::synchronize() { - dispatch::clFinish(*cl_); + check(dispatch::clFinish(*cl_)); } -void cl_stream::enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, Event* event) { - cl_int err = dispatch::clEnqueueNDRangeKernel(*cl_, *kernel->cl(), grid.size(), NULL, (const size_t*)grid.data(), (const size_t*)block.data(), 0, NULL, NULL); +void cl_stream::enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, event* event) { + check(dispatch::clEnqueueNDRangeKernel(*cl_, *kernel->cl(), grid.size(), NULL, (const size_t*)grid.data(), (const size_t*)block.data(), 0, NULL, NULL)); } void cl_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) { - cl_int err = dispatch::clEnqueueWriteBuffer(*cl_, *buffer->cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL); + check(dispatch::clEnqueueWriteBuffer(*cl_, *buffer->cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL)); } void cl_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) { - cl_int err = dispatch::clEnqueueReadBuffer(*cl_, *buffer->cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL); + check(dispatch::clEnqueueReadBuffer(*cl_, *buffer->cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL)); } /* ------------------------ */ @@ -115,7 +113,7 @@ void cu_stream::synchronize() { dispatch::cuStreamSynchronize(*cu_); } -void cu_stream::enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, Event* event) { +void cu_stream::enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, event* event) { driver::cu_kernel* cu_kernel = (driver::cu_kernel*)kernel; cu_context::context_switcher ctx_switch(*ctx_); if(event)