[code generation] bug fixes in grid axes binding

This commit is contained in:
Philippe Tillet
2019-02-08 23:32:17 -05:00
parent f697fcb887
commit 77dd99efe8
4 changed files with 22 additions and 40 deletions

View File

@@ -34,7 +34,6 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${LLVM_CXXFLAGS} -std=c++11")
# TDL # TDL
file(GLOB_RECURSE LIBTDL_SRC lib/*.cpp) file(GLOB_RECURSE LIBTDL_SRC lib/*.cpp)
add_library(tdl SHARED ${LIBTDL_SRC} ${BISON_Parser_OUTPUTS} ${FLEX_Lexer_OUTPUTS}) add_library(tdl SHARED ${LIBTDL_SRC} ${BISON_Parser_OUTPUTS} ${FLEX_Lexer_OUTPUTS})
message(STATUS ${llvm_libs})
target_link_libraries(tdl ${llvm_libs}) target_link_libraries(tdl ${llvm_libs})
# Examples # Examples

View File

@@ -31,22 +31,10 @@ extern translation_unit *ast_root;
const char src[] = const char src[] =
"\ "\
void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\
int32 rx[32] = get_global_range[32](0);\ int32 rx[16] = get_global_range[16](0);\
int32 ry[32] = get_global_range[32](1);\ int32 ry[16] = get_global_range[16](1);\
int32 rka[8] = 0 ... 8;\ fp32 C[16, 16] = 1;\
int32 rkb[8] = 0 ... 8;\ fp32* pc[16, 16] = c + rx[:, newaxis] + ry[newaxis, :]*M;\
fp32 C[32, 32] = 0;\
int32 k;\
fp32* pa[32, 8] = a + rx[:, newaxis] + rka[newaxis, :]*M;\
fp32* pb[32, 8] = b + ry[:, newaxis] + rkb[newaxis, :]*K;\
fp32* pc[32, 32] = c + rx[:, newaxis] + ry[newaxis, :]*M;\
for(k = K; k > 0; k = k - 8){\
fp32 a[32, 8] = *pa;\
fp32 b[32, 8] = *pb;\
C = C + 1;\
pa = pa + 8*M;\
pb = pb + 8*K;\
}\
*pc = C;\ *pc = C;\
}\ }\
"; ";
@@ -151,14 +139,10 @@ int main() {
// tuning parameters // tuning parameters
tune.run(module); tune.run(module);
std::vector<unsigned> params = { std::vector<unsigned> params = {
// asm // c0
2, 8, 1, 2, 8, 1,
// bsn // c1
4, 4, 1, 4, 4, 1,
// pa
2, 4, 1,
// pb
1, 8, 1,
}; };
std::map<tdl::ir::value*, std::vector<std::string>> errors; std::map<tdl::ir::value*, std::vector<std::string>> errors;
unsigned i = 0; unsigned i = 0;
@@ -184,7 +168,7 @@ int main() {
// generate machine code // generate machine code
std::string src = generate_machine_code(llvm_module, "nvptx64-nvidia-cuda", compute_data_layout(true, true)); std::string src = generate_machine_code(llvm_module, "nvptx64-nvidia-cuda", compute_data_layout(true, true));
std::cout << src << std::endl; // std::cout << src << std::endl;
// compile machine code // compile machine code
CUdevice cu_device; CUdevice cu_device;
@@ -220,16 +204,16 @@ int main() {
void *args[] = { &d_a, &d_b, &d_c, &M, &N, &K}; void *args[] = { &d_a, &d_b, &d_c, &M, &N, &K};
int num_regs; int num_regs;
cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, cu_kernel); cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, cu_kernel);
unsigned TM = params[0]*params[1]; unsigned TM = 16;
unsigned TN = params[3]*params[4]; unsigned TN = 16;
unsigned nthreads = params[1]*params[2]*params[7]*params[8]; unsigned nthreads = 32;
checkCudaErrors(cuLaunchKernel(cu_kernel, M/TM, N/TN, 1, nthreads, 1, 1, 0, cu_stream, args, NULL)); checkCudaErrors(cuLaunchKernel(cu_kernel, M/TM, N/TN, 1, nthreads, 1, 1, 0, cu_stream, args, NULL));
checkCudaErrors(cuStreamSynchronize(cu_stream)); checkCudaErrors(cuStreamSynchronize(cu_stream));
// Write back // Write back
checkCudaErrors(cuMemcpyDtoH(c.data(), d_c, sizeof(numeric_t) * c.size())); checkCudaErrors(cuMemcpyDtoH(c.data(), d_c, sizeof(numeric_t) * c.size()));
for(size_t i = 0; i < M*N; i++) for(size_t i = 0; i < M*N; i++)
if(c[i] == 32) if(c[i] != 1)
std::cout << i << " " << "success" << std::endl; std::cout << i << " " << "failure" << std::endl;
return 0; return 0;
} }

View File

@@ -109,7 +109,7 @@ private:
tmap_t tmap_; tmap_t tmap_;
allocation *alloc_; allocation *alloc_;
tune *params_; tune *params_;
std::map<ir::value*, std::vector<distributed_axis>> axes_; std::map<unsigned*, distributed_axis> axes_;
}; };
} }

View File

@@ -63,7 +63,8 @@ Value* shared_tile::shared_offset(indices_t idx) {
return result; return result;
} }
shared_tile::shared_tile(Type *ty, const shapes_t &shapes, Value *ptr, llvm::IRBuilder<> &builder): tile(ty, shapes), ptr_(ptr), builder_(builder) { shared_tile::shared_tile(Type *ty, const shapes_t &shapes, Value *ptr, llvm::IRBuilder<> &builder):
tile(ty, shapes), ptr_(ptr), builder_(builder) {
} }
@@ -236,8 +237,8 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id
std::vector<Value*> thread_id_in_warp = delinearize(u_thread_id, warp_size, builder); std::vector<Value*> thread_id_in_warp = delinearize(u_thread_id, warp_size, builder);
std::vector<Value*> warp_id = delinearize(u_warp_id, n_warps, builder); std::vector<Value*> warp_id = delinearize(u_warp_id, n_warps, builder);
// Create axes // Create axes
std::vector<distributed_axis> axes(dim);
for(unsigned k = 0; k < dim; k++) { for(unsigned k = 0; k < dim; k++) {
std::string str_k = std::to_string(k);
Value *warp_size_k = builder.getInt32(warp_size[k]); Value *warp_size_k = builder.getInt32(warp_size[k]);
Value *contiguous_k = builder.getInt32(contiguous[k]); Value *contiguous_k = builder.getInt32(contiguous[k]);
Value *thread_id = builder.CreateAdd(thread_id_in_warp[k], builder.CreateMul(warp_id[k], warp_size_k)); Value *thread_id = builder.CreateAdd(thread_id_in_warp[k], builder.CreateMul(warp_id[k], warp_size_k));
@@ -247,12 +248,10 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id
std::vector<Value*> idx_list(per_thread); std::vector<Value*> idx_list(per_thread);
for(unsigned n = 0 ; n < per_thread; n++){ for(unsigned n = 0 ; n < per_thread; n++){
unsigned offset = n / contiguous[k] * per_block + n % contiguous[k]; unsigned offset = n / contiguous[k] * per_block + n % contiguous[k];
idx_list[n] = builder.CreateAdd(thread_id, builder.getInt32(offset)); idx_list[n] = builder.CreateAdd(thread_id, builder.getInt32(offset), "idx_" + str_k + "_" + std::to_string(n));
} }
axes[k] = distributed_axis{idx_list}; axes_[params_->get_param(v, "p0.d" + str_k)] = distributed_axis{idx_list};
} }
// Store axes
axes_[v] = axes;
} }
void selection::create_grids(std::vector<ir::value*> &grids, void selection::create_grids(std::vector<ir::value*> &grids,
@@ -327,7 +326,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder,
for(size_t d = 0; d < shapes.size(); d++){ for(size_t d = 0; d < shapes.size(); d++){
if(shapes[d] > 1){ if(shapes[d] > 1){
unsigned *x = params_->get_param(v, "p0.d" + std::to_string(d)); unsigned *x = params_->get_param(v, "p0.d" + std::to_string(d));
axes[d] = axes_.at(references.at(x))[d]; axes[d] = axes_.at(x);
} }
else else
axes[d].values = {builder.getInt32(0)}; axes[d].values = {builder.getInt32(0)};
@@ -337,6 +336,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder,
// constant range // constant range
if(dynamic_cast<ir::constant*>(v)) if(dynamic_cast<ir::constant*>(v))
T->for_each([&](indices_t idx){ T->for_each([&](indices_t idx){
assert(idx.size() == 1);
T->set_value(idx, idx[0]); T->set_value(idx, idx[0]);
}); });
@@ -397,8 +397,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> &
Value *offset = builder.CreateMul(builder.getInt32(shapes[0]), group_id); Value *offset = builder.CreateMul(builder.getInt32(shapes[0]), group_id);
result->for_each([&](indices_t idx){ result->for_each([&](indices_t idx){
BinaryOperator *bin = static_cast<BinaryOperator*>(idx[0]); BinaryOperator *bin = static_cast<BinaryOperator*>(idx[0]);
result->set_value(idx, builder.CreateAdd(bin->getOperand(1), result->set_value(idx, builder.CreateAdd(bin, offset));
builder.CreateAdd(bin->getOperand(0), offset)));
}); });
} }
// reshape // reshape
@@ -430,8 +429,8 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> &
for(size_t k = 0; k < in_idx.size(); k++){ for(size_t k = 0; k < in_idx.size(); k++){
if(in_shapes[k] == 1) if(in_shapes[k] == 1)
in_idx[k] = builder.getInt32(0); in_idx[k] = builder.getInt32(0);
result->set_value(out_idx, in_tile->get_value(in_idx));
} }
result->set_value(out_idx, in_tile->get_value(in_idx));
}); });
} }
// copy to shared // copy to shared