From 6dc88878aceacddc7dbca911b7b404685f63cdeb Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 24 Feb 2019 23:22:28 -0500 Subject: [PATCH] [code generation] bugfix in double-buffering --- examples/matrix.cpp | 49 +++++++++++++------------------------ lib/codegen/allocation.cpp | 2 +- lib/codegen/buffer_info.cpp | 2 +- lib/codegen/selection.cpp | 21 +++++++++++----- lib/codegen/vectorize.cpp | 4 ++- 5 files changed, 37 insertions(+), 41 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 382f8a324..a04c1d95e 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -60,27 +60,11 @@ void matmul(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K, int32 bound){\ int1 checkc1[TN] = ryc < N;\ int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];\ for(k = K; k > 0; k = k - TK){\ - int1 checka[TM, TK] = (k > 8);\ - int1 checkb[TN, TK] = (k > 8);\ - int1 checka0[TM];\ - int1 checka1[TK];\ - int1 checkb0[TN];\ - int1 checkb1[TK];\ C = dot(a, b, C);\ pa = pa + TK*M;\ pb = pb + TK*K;\ - @checka a = *pa;\ - @checkb b = *pb;\ - if(k > 8)\ - continue;\ - checka0 = rxa < M;\ - checka1 = rka < k;\ - checkb0 = ryb < N;\ - checkb1 = rkb < k;\ - checka = checka0[:, newaxis] && checka1[newaxis, :];\ - checkb = checkb0[:, newaxis] && checkb1[newaxis, :];\ - @checka a = *pa;\ - @checkb b = *pb;\ + a = *pa;\ + b = *pb;\ }\ @checkc *pc = C;\ }\ @@ -219,21 +203,22 @@ int main() { // tuning parameters tune.run(module); + std::vector params = { // shapes - 16, 16, 8, + 8, 8, 8, // a0 - 2, 8, 1, + 1, 8, 1, // b0 - 4, 4, 1, + 1, 8, 1, // c0 - 2, 8, 1, + 1, 8, 1, // c1 - 4, 4, 1, + 1, 4, 2, // a1 - 2, 4, 1, + 1, 4, 2, // b1 - 1, 8, 1 + 1, 4, 2 }; // meta-parameters unsigned i = 0; @@ -255,23 +240,22 @@ int main() { // run passes - triton::ir::print(module, std::cout); buffer_info.run(module); shared.run(module); liveness.run(module); allocation.run(); barriers.run(module); +// triton::ir::print(module, std::cout); vectorize.run(module); selection.run(module, llvm_module); // llvm source llvm::legacy::PassManager manager; - manager.add(llvm::createPrintModulePass(llvm::outs())); +// manager.add(llvm::createPrintModulePass(llvm::outs())); manager.add(llvm::createVerifierPass(true)); manager.run(llvm_module); std::string src = generate_machine_code(llvm_module, "nvptx64-nvidia-cuda", compute_data_layout(true, true)); - std::cout << src << std::endl; // compile machine code CUdevice cu_device; @@ -285,16 +269,17 @@ int main() { // execute machine code // Allocate buffers typedef float numeric_t; - size_t M = 32, N = 32, K = 32; + size_t M = 128, N = 128, K = 128; size_t bound = 8; std::vector c(M*N); std::vector rc(M*N); std::vector a(M*K); std::vector b(K*N); + srand(0); for(size_t i = 0; i < a.size(); i++) - a[i] = (float)rand() / RAND_MAX; + a[i] = (float)rand()/RAND_MAX; for(size_t i = 0; i < b.size(); i++) - b[i] = (float)rand() / RAND_MAX; + b[i] = (float)rand()/RAND_MAX; for(size_t i = 0; i < c.size(); i++) c[i] = 0; CUdeviceptr d_a, d_b, d_c; @@ -311,7 +296,7 @@ int main() { cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, cu_kernel); unsigned TM = context.p_impl->mp_constants_[0]->get_value(); unsigned TN = context.p_impl->mp_constants_[1]->get_value(); - unsigned nthreads = 32; + unsigned nthreads = params[10]*params[13]*params[11]*params[14]; checkCudaErrors(cuLaunchKernel(cu_kernel, (M + TM - 1)/TM, (N + TN - 1)/TN, 1, nthreads, 1, 1, 0, cu_stream, args, NULL)); checkCudaErrors(cuStreamSynchronize(cu_stream)); // Write back diff --git a/lib/codegen/allocation.cpp b/lib/codegen/allocation.cpp index c4dd538f8..9a3d5e39d 100644 --- a/lib/codegen/allocation.cpp +++ b/lib/codegen/allocation.cpp @@ -12,7 +12,7 @@ namespace triton{ namespace codegen{ unsigned allocation::get_num_bytes(ir::value *x) { - unsigned result = x->get_type()->get_tile_bitwidth(); + unsigned result = x->get_type()->get_tile_bitwidth() / 8; if(buffer_info_->is_double(x)) result *= 2; return result; diff --git a/lib/codegen/buffer_info.cpp b/lib/codegen/buffer_info.cpp index 92e27fd23..4d2a3c676 100644 --- a/lib/codegen/buffer_info.cpp +++ b/lib/codegen/buffer_info.cpp @@ -16,7 +16,7 @@ bool buffer_info_pass::is_loop_latch(ir::phi_node *phi, ir::value *terminator){ return br->get_true_dest() == phi->get_parent() || br->get_false_dest() == phi->get_parent(); else if(auto *br = dynamic_cast(terminator)) - return br->get_dest() == phi->get_parent(); + return false; else throw std::runtime_error("unreachable"); } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 1b09ca4f9..1e7f9c4e4 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -376,6 +376,13 @@ void selection::create_grids(std::vector &grids, grids.push_back(ref.second); } +bool static inline has_phi_user(ir::value *v) { + for(ir::user *usr: v->get_users()){ + if(dynamic_cast(usr)) + return true; + } + return false; +} void selection::create_tile(ir::value *v, IRBuilder<> &builder, const std::map& references, std::set &seen, Value *sh_mem_ptr) { @@ -394,8 +401,9 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, if(dynamic_cast(v) || (buffer_info_->is_double(v))){ // shared copy PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr->getType()->getPointerAddressSpace()); + // TODO - buffer info not up-to-date with references if(dynamic_cast(v)) { - if(buffer_info_->get_reference(v) == nullptr){ + if(!has_phi_user(v)){ size_t offset = alloc_->get_offset(v); Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); ptr = builder.CreateBitCast(ptr, ptr_ty); @@ -417,7 +425,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, // next pointer Value *pre_ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(alloc_->get_offset(phi))); pre_ptr = builder.CreateBitCast(pre_ptr, ptr->getType()); - Value *next_ptr = builder.CreateGEP(ptr, offset); + Value *next_ptr = builder.CreateGEP(ptr, offset, "next_ptr"); tmap_.insert({phi, new shared_tile(ty, shapes2, ptr, builder, offset)}); for(unsigned i = 0; i < phi->get_num_incoming(); i++) { ir::basic_block* inc_block = phi->get_incoming_block(i); @@ -720,12 +728,13 @@ void selection::run(ir::module &src, Module &dst){ PHINode *ptr = (PHINode*)((shared_tile*)tmap_.at(phi))->get_pointer(); PHINode *offset = (PHINode*)((shared_tile*)tmap_.at(phi))->get_offset(); for(unsigned n = 0; n < phi->get_num_incoming(); n++){ - ir::value *inc_val = phi->get_incoming_value(n); - ir::basic_block *inc_block = phi->get_incoming_block(n); + ir::basic_block* inc_block = phi->get_incoming_block(n); + ir::value* inc_val = phi->get_incoming_value(n); + ir::value* terminator = inc_block->get_inst_list().back(); BasicBlock *llvm_inc_block = last_block.at(inc_block); shared_tile *inc_shared = (shared_tile*)tmap_.at(inc_val); - GetElementPtrInst *inc_ptr = dyn_cast(inc_shared->get_pointer()); - if(inc_ptr && ptr == inc_ptr->getPointerOperand()){ + bool is_loop_latch = buffer_info_->is_loop_latch(phi, terminator); + if(is_loop_latch){ dst_builder.SetInsertPoint(llvm_inc_block->getTerminator()); Value *next_offset = dst_builder.CreateNeg(offset); offset->addIncoming(next_offset, llvm_inc_block); diff --git a/lib/codegen/vectorize.cpp b/lib/codegen/vectorize.cpp index c9757c6aa..57c2142c9 100644 --- a/lib/codegen/vectorize.cpp +++ b/lib/codegen/vectorize.cpp @@ -15,8 +15,10 @@ void vectorize::run(ir::module &mod) { for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()) if(dynamic_cast(i)){ - builder.set_insert_point(i); ir::value *x = i->get_operand(0); + if(*params_->get_param(x, "p0.d0") == 1) + continue; + builder.set_insert_point(i); ir::instruction *rx = (ir::instruction*)builder.create_vectorize(x); x->replace_all_uses_with(rx); rx->set_operand(0, x);