[examples] normalize benchmark by max_clock / current_clock

2019-03-28 07:11:06 -04:00
parent 2c3ae0675e
commit 0c607c9392
9 changed files with 44 additions and 29 deletions
--- a/examples/matrix.cpp
+++ b/examples/matrix.cpp
@@ -87,19 +87,28 @@ T min(std::vector<T> x)


 template<class OP, class SYNC>
-double bench(OP const & op, SYNC const & sync, unsigned repeat = 20)
+double bench(OP const & op, SYNC const & sync, triton::driver::device const & device)
 {
  timer tmr;
+  std::vector<size_t> times;
+  double total_time = 0;
  op();
  sync();
-  tmr.start();
-  for(unsigned i = 0; i < repeat; i++)
+  while(total_time*1e-9 < 1e-3){
+    float norm = 1;
+    // normalize clock if possible to get roughly constant result
+    if(auto cu_device = dynamic_cast<const triton::driver::cu_device*>(&device))
+      norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock();
+    tmr.start();
    op();
-  sync();
-  double time = tmr.get().count();
-  return time / repeat;
+    sync();
+    times.push_back(norm*tmr.get().count());
+    total_time+=times.back();
+  }
+  return min(times);
 }

+
 int main() {
  // initialize default compute device
  auto context = triton::driver::backend::contexts::get_default();
@@ -159,7 +168,7 @@ int main() {
    stream->synchronize();
    // benchmark
    double ts = bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});},
-                      [&](){ stream->synchronize(); });
+                      [&](){ stream->synchronize(); }, *context->device());
    ts = ts * 1e-9;
    double tflops = 2.*M*N*K / ts * 1e-12;
    return tflops;
@@ -175,7 +184,7 @@ int main() {
    4
  };

-//  jit.autotune(src, benchmark);
+  jit.autotune(src, benchmark);
  jit.add_module(src, params);
  triton::driver::kernel* kernel = jit.get_function("matmul");
  triton::jit::launch_information info = jit.get_launch_info("matmul");
--- a/include/triton/driver/device.h
+++ b/include/triton/driver/device.h
@@ -115,6 +115,7 @@ public:
  size_t max_shared_memory() const;
  size_t max_sm_clock() const;
  size_t max_mem_clock() const;
+  void set_max_clock();
  // Target
  std::unique_ptr<codegen::target> make_target() const;

--- a/include/triton/driver/dispatch.h
+++ b/include/triton/driver/dispatch.h
@@ -165,6 +165,7 @@ public:
  static nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2( const char* pciBusId, nvmlDevice_t* device);
  static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
  static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
+  static nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int mem_clock, unsigned int sm_clock);

  static cublasHandle_t cublasHandle(driver::cu_context const & ctx);
  static cublasStatus_t cublasCreate_v2(cublasHandle_t* h);
@@ -281,6 +282,7 @@ private:
  static void* nvmlDeviceGetHandleByPciBusId_v2_;
  static void* nvmlDeviceGetClockInfo_;
  static void* nvmlDeviceGetMaxClockInfo_;
+  static void* nvmlDeviceSetApplicationsClocks_;
  // cuBLAS
  static void* cublasCreate_v2_;
  static void* cublasGetStream_v2_;
--- a/include/triton/jit.h
+++ b/include/triton/jit.h
@@ -61,7 +61,6 @@ public:
        barriers.run(module);
      }
      vectorize.run(module);
-      triton::ir::print(module, std::cout);
    }

    codegen::tune tune;
--- a/lib/ast/lowering.cpp
+++ b/lib/ast/lowering.cpp
@@ -234,7 +234,7 @@ ir::type* tile::type_impl(ir::module *mod, ir::type *type, storage_spec_vec_cons
 // Pointer
 ir::type* pointer::type_impl(ir::module*, ir::type *type, storage_spec_vec_const_ref_t storage) const{
  bool is_ptr_to_const = std::find(storage.begin(), storage.end(), CONSTANT_SPACE_T) != storage.end();
-  return ir::pointer_type::get(type, is_ptr_to_const?4:0);
+  return ir::pointer_type::get(type, is_ptr_to_const?4:1);
 }

 // Function
--- a/lib/codegen/tune.cpp
+++ b/lib/codegen/tune.cpp
@@ -145,7 +145,7 @@ void tune::run(ir::module &mod) {
    // Layout parameters
    while(!nodes_.empty()){
      ir::type *ty = mod.get_builder().get_int32_ty();
-      ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 2, 2);
+      ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 2, 4);
      ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 4, 32);
      connected_components(*nodes_.begin(), {nts, mts}, nodes_, dependencies_);
    }
--- a/lib/driver/device.cpp
+++ b/lib/driver/device.cpp
@@ -1,22 +1,22 @@
 /* Copyright 2015-2017 Philippe Tillet
-* 
-* Permission is hereby granted, free of charge, to any person obtaining 
-* a copy of this software and associated documentation files 
-* (the "Software"), to deal in the Software without restriction, 
-* including without limitation the rights to use, copy, modify, merge, 
-* publish, distribute, sublicense, and/or sell copies of the Software, 
-* and to permit persons to whom the Software is furnished to do so, 
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files
+* (the "Software"), to deal in the Software without restriction,
+* including without limitation the rights to use, copy, modify, merge,
+* publish, distribute, sublicense, and/or sell copies of the Software,
+* and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
-* 
-* The above copyright notice and this permission notice shall be 
+*
+* The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
-* 
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

@@ -217,6 +217,11 @@ size_t cu_device::max_mem_clock() const{
  return result;
 }

+// max memory clock
+void cu_device::set_max_clock() {
+  dispatch::nvmlDeviceSetApplicationsClocks(nvml_device(), max_mem_clock(), max_sm_clock());
+}
+
 // print infos
 std::string cu_device::infos() const{
  std::ostringstream oss;
--- a/lib/driver/dispatch.cpp
+++ b/lib/driver/dispatch.cpp
@@ -199,6 +199,7 @@ CUDA_DEFINE1(CUresult, cuCtxPopCurrent_v2, CUcontext*)
 NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *, nvmlDevice_t*)
 NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
 NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
+NVML_DEFINE3(nvmlReturn_t, nvmlDeviceSetApplicationsClocks, nvmlDevice_t, unsigned int, unsigned int)

 cublasHandle_t dispatch::cublasHandle(const cu_context &ctx){
  static std::map<CUcontext, cublasHandle_t> handles;
@@ -391,6 +392,7 @@ void* dispatch::nvmlInit_v2_;
 void* dispatch::nvmlDeviceGetHandleByPciBusId_v2_;
 void* dispatch::nvmlDeviceGetClockInfo_;
 void* dispatch::nvmlDeviceGetMaxClockInfo_;
+void* dispatch::nvmlDeviceSetApplicationsClocks_;

 void* dispatch::cublasCreate_v2_;
 void* dispatch::cublasGetStream_v2_;
--- a/lib/driver/module.cpp
+++ b/lib/driver/module.cpp
@@ -106,6 +106,7 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple
                                        const std::string& features,
                                        file_type_t ft) {
  init_llvm();
+
  // create machine
  module->setTargetTriple(triple);
  std::string error;
@@ -179,10 +180,6 @@ host_module::host_module(driver::context * context, llvm::Module* src): module(c


  // create execution engine
-//  llvm::legacy::PassManager pass;
-//  pass.add(llvm::createPrintModulePass(llvm::outs()));
-//  pass.add(llvm::createVerifierPass());
-//  pass.run(*src);
  auto cloned = llvm::CloneModule(*src);
  for(llvm::Function& fn: cloned->functions())
    hst_->functions[fn.getName()] = &fn;