[examples] normalize benchmark by max_clock / current_clock

This commit is contained in:
Philippe Tillet
2019-03-28 07:11:06 -04:00
parent 2c3ae0675e
commit 0c607c9392
9 changed files with 44 additions and 29 deletions

View File

@@ -87,19 +87,28 @@ T min(std::vector<T> x)
template<class OP, class SYNC>
double bench(OP const & op, SYNC const & sync, unsigned repeat = 20)
double bench(OP const & op, SYNC const & sync, triton::driver::device const & device)
{
timer tmr;
std::vector<size_t> times;
double total_time = 0;
op();
sync();
tmr.start();
for(unsigned i = 0; i < repeat; i++)
while(total_time*1e-9 < 1e-3){
float norm = 1;
// normalize clock if possible to get roughly constant result
if(auto cu_device = dynamic_cast<const triton::driver::cu_device*>(&device))
norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock();
tmr.start();
op();
sync();
double time = tmr.get().count();
return time / repeat;
sync();
times.push_back(norm*tmr.get().count());
total_time+=times.back();
}
return min(times);
}
int main() {
// initialize default compute device
auto context = triton::driver::backend::contexts::get_default();
@@ -159,7 +168,7 @@ int main() {
stream->synchronize();
// benchmark
double ts = bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});},
[&](){ stream->synchronize(); });
[&](){ stream->synchronize(); }, *context->device());
ts = ts * 1e-9;
double tflops = 2.*M*N*K / ts * 1e-12;
return tflops;
@@ -175,7 +184,7 @@ int main() {
4
};
// jit.autotune(src, benchmark);
jit.autotune(src, benchmark);
jit.add_module(src, params);
triton::driver::kernel* kernel = jit.get_function("matmul");
triton::jit::launch_information info = jit.get_launch_info("matmul");

View File

@@ -115,6 +115,7 @@ public:
size_t max_shared_memory() const;
size_t max_sm_clock() const;
size_t max_mem_clock() const;
void set_max_clock();
// Target
std::unique_ptr<codegen::target> make_target() const;

View File

@@ -165,6 +165,7 @@ public:
static nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2( const char* pciBusId, nvmlDevice_t* device);
static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
static nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int mem_clock, unsigned int sm_clock);
static cublasHandle_t cublasHandle(driver::cu_context const & ctx);
static cublasStatus_t cublasCreate_v2(cublasHandle_t* h);
@@ -281,6 +282,7 @@ private:
static void* nvmlDeviceGetHandleByPciBusId_v2_;
static void* nvmlDeviceGetClockInfo_;
static void* nvmlDeviceGetMaxClockInfo_;
static void* nvmlDeviceSetApplicationsClocks_;
// cuBLAS
static void* cublasCreate_v2_;
static void* cublasGetStream_v2_;

View File

@@ -61,7 +61,6 @@ public:
barriers.run(module);
}
vectorize.run(module);
triton::ir::print(module, std::cout);
}
codegen::tune tune;

View File

@@ -234,7 +234,7 @@ ir::type* tile::type_impl(ir::module *mod, ir::type *type, storage_spec_vec_cons
// Pointer
ir::type* pointer::type_impl(ir::module*, ir::type *type, storage_spec_vec_const_ref_t storage) const{
bool is_ptr_to_const = std::find(storage.begin(), storage.end(), CONSTANT_SPACE_T) != storage.end();
return ir::pointer_type::get(type, is_ptr_to_const?4:0);
return ir::pointer_type::get(type, is_ptr_to_const?4:1);
}
// Function

View File

@@ -145,7 +145,7 @@ void tune::run(ir::module &mod) {
// Layout parameters
while(!nodes_.empty()){
ir::type *ty = mod.get_builder().get_int32_ty();
ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 2, 2);
ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 2, 4);
ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 4, 32);
connected_components(*nodes_.begin(), {nts, mts}, nodes_, dependencies_);
}

View File

@@ -1,22 +1,22 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
@@ -217,6 +217,11 @@ size_t cu_device::max_mem_clock() const{
return result;
}
// max memory clock
void cu_device::set_max_clock() {
dispatch::nvmlDeviceSetApplicationsClocks(nvml_device(), max_mem_clock(), max_sm_clock());
}
// print infos
std::string cu_device::infos() const{
std::ostringstream oss;

View File

@@ -199,6 +199,7 @@ CUDA_DEFINE1(CUresult, cuCtxPopCurrent_v2, CUcontext*)
NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *, nvmlDevice_t*)
NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
NVML_DEFINE3(nvmlReturn_t, nvmlDeviceSetApplicationsClocks, nvmlDevice_t, unsigned int, unsigned int)
cublasHandle_t dispatch::cublasHandle(const cu_context &ctx){
static std::map<CUcontext, cublasHandle_t> handles;
@@ -391,6 +392,7 @@ void* dispatch::nvmlInit_v2_;
void* dispatch::nvmlDeviceGetHandleByPciBusId_v2_;
void* dispatch::nvmlDeviceGetClockInfo_;
void* dispatch::nvmlDeviceGetMaxClockInfo_;
void* dispatch::nvmlDeviceSetApplicationsClocks_;
void* dispatch::cublasCreate_v2_;
void* dispatch::cublasGetStream_v2_;

View File

@@ -106,6 +106,7 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple
const std::string& features,
file_type_t ft) {
init_llvm();
// create machine
module->setTargetTriple(triple);
std::string error;
@@ -179,10 +180,6 @@ host_module::host_module(driver::context * context, llvm::Module* src): module(c
// create execution engine
// llvm::legacy::PassManager pass;
// pass.add(llvm::createPrintModulePass(llvm::outs()));
// pass.add(llvm::createVerifierPass());
// pass.run(*src);
auto cloned = llvm::CloneModule(*src);
for(llvm::Function& fn: cloned->functions())
hst_->functions[fn.getName()] = &fn;