diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index 554b3bcc3..9d07bf227 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -38,7 +38,7 @@ inline double bench(std::function const & op, driver::stream * stream) double total_time = 0; op(); stream->synchronize(); - while(total_time*1e-9 < 1e-3){ + while(total_time*1e-9 < 1e-1){ float norm = 1; // normalize clock if possible to reduce noise in auto-tuning if(auto cu_device = dynamic_cast(stream->context()->device())) diff --git a/lib/codegen/analysis/align.cc b/lib/codegen/analysis/align.cc index ef57e7a4f..e31799b59 100644 --- a/lib/codegen/analysis/align.cc +++ b/lib/codegen/analysis/align.cc @@ -487,6 +487,9 @@ void align::populate(ir::value *v) { populate_is_constant(v); populate_starting_multiple(v); populate_max_contiguous(v); +// std::cout << v->get_name() << std::endl; +// if(max_contiguous_[v].size() == 2) +// std::cout << max_contiguous_[v][0] << " " << max_contiguous_[v][1] << std::endl; } void align::run(ir::module &mod) { diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 0bf85c84f..66c775ac6 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -241,6 +241,7 @@ std::string cu_module::compile_llvm_module(std::unique_ptr module, cu_module::cu_module(driver::context * context, std::unique_ptr ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ +// std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index ec0fcd990..86c0a3f8f 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -161,6 +161,7 @@ function::caller function::autotune(driver::stream* stream, const grid_fn_ty& gr for(auto it: opt_space_.defines) cpp.AddMacro(it.first, &opt.defines.at(it.first)); cpp.Process(tokens); +// tokens.Print(stdout); // parse Parser parser(tokens); parser.Parse(); @@ -215,16 +216,19 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c // run passes peephole.run(module); dce.run(module); +// ir::print(module, std::cout); align.run(module); cts.run(module); axes.run(module); layouts.run(module); coalesce.run(module); +// ir::print(module, std::cout); dce.run(module); align.run(module); dce.run(module); tiles.run(module); reassociate.run(module); + peephole.run(module); dce.run(module); cts.run(module); liveness.run(module); diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 19dd95cd9..4f6c989e9 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -48,7 +48,7 @@ std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, i opt.defines.push_back({"TM", {"128"}}); opt.defines.push_back({"TN", {"128"}}); opt.defines.push_back({"TK", {"16"}}); - opt.num_warps = {2, 4, 8}; + opt.num_warps = {4}; // create function rt::function function(src::dot, opt); // benchmark available libraries @@ -79,12 +79,9 @@ int main() { // shapes to benchmark typedef std::tuple config_t; std::vector configs; - for(auto x: std::vector>{{false, false}, - {false, true}, - {true, false}, - {true, true}}){ + for(auto x: std::vector>{{false, false}}){ std::vector tmp = { - config_t{x[0], x[1], 2048, 2048, 2048} + config_t{x[0], x[1], 4096, 4096, 4096} // config_t{x[0], x[1], 16, 2048, 2048}, // config_t{x[0], x[1], 32, 2048, 2048}, // config_t{x[0], x[1], 64, 2048, 2048},