diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 171c1d3f5..e630e5164 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -87,19 +87,28 @@ T min(std::vector x) template -double bench(OP const & op, SYNC const & sync, unsigned repeat = 20) +double bench(OP const & op, SYNC const & sync, triton::driver::device const & device) { timer tmr; + std::vector times; + double total_time = 0; op(); sync(); - tmr.start(); - for(unsigned i = 0; i < repeat; i++) + while(total_time*1e-9 < 1e-3){ + float norm = 1; + // normalize clock if possible to get roughly constant result + if(auto cu_device = dynamic_cast(&device)) + norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); + tmr.start(); op(); - sync(); - double time = tmr.get().count(); - return time / repeat; + sync(); + times.push_back(norm*tmr.get().count()); + total_time+=times.back(); + } + return min(times); } + int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); @@ -159,7 +168,7 @@ int main() { stream->synchronize(); // benchmark double ts = bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, - [&](){ stream->synchronize(); }); + [&](){ stream->synchronize(); }, *context->device()); ts = ts * 1e-9; double tflops = 2.*M*N*K / ts * 1e-12; return tflops; @@ -175,7 +184,7 @@ int main() { 4 }; -// jit.autotune(src, benchmark); + jit.autotune(src, benchmark); jit.add_module(src, params); triton::driver::kernel* kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); diff --git a/include/triton/driver/device.h b/include/triton/driver/device.h index 34d299e91..f4a786a31 100755 --- a/include/triton/driver/device.h +++ b/include/triton/driver/device.h @@ -115,6 +115,7 @@ public: size_t max_shared_memory() const; size_t max_sm_clock() const; size_t max_mem_clock() const; + void set_max_clock(); // Target std::unique_ptr make_target() const; diff --git a/include/triton/driver/dispatch.h b/include/triton/driver/dispatch.h index 2d06bb397..71411b1ca 100755 --- a/include/triton/driver/dispatch.h +++ b/include/triton/driver/dispatch.h @@ -165,6 +165,7 @@ public: static nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2( const char* pciBusId, nvmlDevice_t* device); static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); + static nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int mem_clock, unsigned int sm_clock); static cublasHandle_t cublasHandle(driver::cu_context const & ctx); static cublasStatus_t cublasCreate_v2(cublasHandle_t* h); @@ -281,6 +282,7 @@ private: static void* nvmlDeviceGetHandleByPciBusId_v2_; static void* nvmlDeviceGetClockInfo_; static void* nvmlDeviceGetMaxClockInfo_; + static void* nvmlDeviceSetApplicationsClocks_; // cuBLAS static void* cublasCreate_v2_; static void* cublasGetStream_v2_; diff --git a/include/triton/jit.h b/include/triton/jit.h index c033cf204..c4809d254 100644 --- a/include/triton/jit.h +++ b/include/triton/jit.h @@ -61,7 +61,6 @@ public: barriers.run(module); } vectorize.run(module); - triton::ir::print(module, std::cout); } codegen::tune tune; diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 84dcbcf3b..77ba26464 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -234,7 +234,7 @@ ir::type* tile::type_impl(ir::module *mod, ir::type *type, storage_spec_vec_cons // Pointer ir::type* pointer::type_impl(ir::module*, ir::type *type, storage_spec_vec_const_ref_t storage) const{ bool is_ptr_to_const = std::find(storage.begin(), storage.end(), CONSTANT_SPACE_T) != storage.end(); - return ir::pointer_type::get(type, is_ptr_to_const?4:0); + return ir::pointer_type::get(type, is_ptr_to_const?4:1); } // Function diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index cf36e36c9..4353b1332 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -145,7 +145,7 @@ void tune::run(ir::module &mod) { // Layout parameters while(!nodes_.empty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 2, 2); + ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 2, 4); ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 4, 32); connected_components(*nodes_.begin(), {nts, mts}, nodes_, dependencies_); } diff --git a/lib/driver/device.cpp b/lib/driver/device.cpp index 950ddc3f3..ae66c50c8 100755 --- a/lib/driver/device.cpp +++ b/lib/driver/device.cpp @@ -1,22 +1,22 @@ /* Copyright 2015-2017 Philippe Tillet -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, * subject to the following conditions: -* -* The above copyright notice and this permission notice shall be +* +* The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ @@ -217,6 +217,11 @@ size_t cu_device::max_mem_clock() const{ return result; } +// max memory clock +void cu_device::set_max_clock() { + dispatch::nvmlDeviceSetApplicationsClocks(nvml_device(), max_mem_clock(), max_sm_clock()); +} + // print infos std::string cu_device::infos() const{ std::ostringstream oss; diff --git a/lib/driver/dispatch.cpp b/lib/driver/dispatch.cpp index 4dfd6df6e..7bb0fd001 100755 --- a/lib/driver/dispatch.cpp +++ b/lib/driver/dispatch.cpp @@ -199,6 +199,7 @@ CUDA_DEFINE1(CUresult, cuCtxPopCurrent_v2, CUcontext*) NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *, nvmlDevice_t*) NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*) NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*) +NVML_DEFINE3(nvmlReturn_t, nvmlDeviceSetApplicationsClocks, nvmlDevice_t, unsigned int, unsigned int) cublasHandle_t dispatch::cublasHandle(const cu_context &ctx){ static std::map handles; @@ -391,6 +392,7 @@ void* dispatch::nvmlInit_v2_; void* dispatch::nvmlDeviceGetHandleByPciBusId_v2_; void* dispatch::nvmlDeviceGetClockInfo_; void* dispatch::nvmlDeviceGetMaxClockInfo_; +void* dispatch::nvmlDeviceSetApplicationsClocks_; void* dispatch::cublasCreate_v2_; void* dispatch::cublasGetStream_v2_; diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index e822794a5..8346961fe 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -106,6 +106,7 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple const std::string& features, file_type_t ft) { init_llvm(); + // create machine module->setTargetTriple(triple); std::string error; @@ -179,10 +180,6 @@ host_module::host_module(driver::context * context, llvm::Module* src): module(c // create execution engine -// llvm::legacy::PassManager pass; -// pass.add(llvm::createPrintModulePass(llvm::outs())); -// pass.add(llvm::createVerifierPass()); -// pass.run(*src); auto cloned = llvm::CloneModule(*src); for(llvm::Function& fn: cloned->functions()) hst_->functions[fn.getName()] = &fn;