GEMM: Enabled use of cuBLAS when predicted beneficial

This commit is contained in:
Philippe Tillet
2016-10-04 04:20:07 -04:00
parent a4ed0dfbec
commit 3293c45e60
26 changed files with 19391 additions and 26856 deletions

View File

@@ -29,7 +29,7 @@ double bench(OP const & op, SYNC const & sync)
double total_time = 0; double total_time = 0;
op(); op();
sync(); sync();
while(total_time*1e-9 < 1e-1){ while(total_time*1e-9 < 2e-1){
tmr.start(); tmr.start();
op(); op();
sync(); sync();
@@ -239,12 +239,13 @@ void bench(sc::numeric_type dtype, std::string operation)
#ifdef BENCH_CUBLAS #ifdef BENCH_CUBLAS
times.push_back(bench([&](){cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)cu(A), lda, (T*)cu(B), ldb, 1, (T*)cu(C), ldc);}, cusync)); times.push_back(bench([&](){cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)cu(A), lda, (T*)cu(B), ldb, 1, (T*)cu(C), ldc);}, cusync));
#endif #endif
std::cout << name << "\t" << M << "\t" << N << "\t" << K << "\t" << cAT << "\t" << cBT;
std::transform(times.begin(), times.end(), std::back_inserter(tflops), [&](double t){ return 2*M*N*K/t*1e-3;}); std::transform(times.begin(), times.end(), std::back_inserter(tflops), [&](double t){ return 2*M*N*K/t*1e-3;});
double best = max(tflops); auto fastest = tflops;
std::sort(fastest.begin(), fastest.end(), std::greater<double>());
std::cout << name << "\t" << M << "\t" << N << "\t" << K << "\t" << cAT << "\t" << cBT;
for(auto x: tflops){ for(auto x: tflops){
std::cout << "\t"; std::cout << "\t";
if(x==best) if(x == fastest[0] && x/fastest[1] >= 1.02)
std::cout << color_stream(FG_LIGHT_BLUE) << x << color_stream(RESET); std::cout << color_stream(FG_LIGHT_BLUE) << x << color_stream(RESET);
else else
std::cout << x; std::cout << x;

View File

@@ -38,13 +38,6 @@ enum backend_type
CUDA CUDA
}; };
void check(nvrtcResult err);
void check(CUresult);
void check_destruction(CUresult);
void check(cl_int err);
} }
} }

View File

@@ -50,7 +50,7 @@ private:
static CUdevice device(CUcontext) static CUdevice device(CUcontext)
{ {
CUdevice res; CUdevice res;
check(dispatch::cuCtxGetDevice(&res)); dispatch::cuCtxGetDevice(&res);
return res; return res;
} }

View File

@@ -32,7 +32,8 @@
#include "isaac/driver/external/CUDA/cuda.h" #include "isaac/driver/external/CUDA/cuda.h"
#include "isaac/driver/external/CUDA/nvrtc.h" #include "isaac/driver/external/CUDA/nvrtc.h"
#include "isaac/driver/external/CUDA/cublas.h" #include "isaac/driver/external/CUDA/cublas.h"
//Exceptions
#include "isaac/driver/common.h"
#include <iostream> #include <iostream>
namespace isaac namespace isaac
@@ -40,6 +41,14 @@ namespace isaac
namespace driver namespace driver
{ {
class Context;
template<class T> void check(T){}
void check(nvrtcResult err);
void check(CUresult err);
void check(cublasStatus_t err);
void check(cl_int err);
void check_destruction(CUresult);
class dispatch class dispatch
{ {
@@ -61,11 +70,11 @@ private:
cache = dlsym(lib_h, name); cache = dlsym(lib_h, name);
FunPtrT fptr; FunPtrT fptr;
*reinterpret_cast<void **>(&fptr) = cache; *reinterpret_cast<void **>(&fptr) = cache;
return (*fptr)(args...); typename return_type<FunPtrT>::type res = (*fptr)(args...);
check(res);
return res;
} }
static cublasStatus_t cublasCreate_v2(cublasHandle_t* h);
public: public:
static bool clinit(); static bool clinit();
static bool cublasinit(); static bool cublasinit();
@@ -146,17 +155,19 @@ public:
static nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames); static nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames);
static nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log); static nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
static cublasStatus_t cublasGetStream(cudaStream_t *streamId); static cublasHandle_t cublasHandle(Context const & ctx);
static cublasStatus_t cublasSetStream(cudaStream_t streamId); static cublasStatus_t cublasCreate_v2(cublasHandle_t* h);
static cublasStatus_t cublasSgemm (cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc); static cublasStatus_t cublasGetStream(cublasHandle_t h, cudaStream_t *streamId);
static cublasStatus_t cublasDgemm (cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc); static cublasStatus_t cublasSetStream(cublasHandle_t h, cudaStream_t streamId);
static cublasStatus_t cublasSgemm (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc);
static cublasStatus_t cublasDgemm (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc);
private: private:
static void* opencl_; static void* opencl_;
static void* cuda_; static void* cuda_;
static void* nvrtc_; static void* nvrtc_;
static void* cublas_; static void* cublas_;
static cublasHandle_t cublas_handle_;
//OpenCL //OpenCL
static void* clBuildProgram_; static void* clBuildProgram_;

View File

@@ -39,7 +39,7 @@ namespace nvrtc
#define ISAAC_CREATE_NVRTC_EXCEPTION(name, msg) class ISAACAPI name: public std::exception { public: const char * what() const throw(){ return "NVRTC: Error- " msg; } } #define ISAAC_CREATE_NVRTC_EXCEPTION(name, msg) class ISAACAPI name: public std::exception { public: const char * what() const throw(){ return "NVRTC: Error- " msg; } }
ISAAC_CREATE_NVRTC_EXCEPTION(out_of_memory ,"out of memory exception"); ISAAC_CREATE_NVRTC_EXCEPTION(out_of_memory ,"out of memory");
ISAAC_CREATE_NVRTC_EXCEPTION(program_creation_failure ,"program creation failure"); ISAAC_CREATE_NVRTC_EXCEPTION(program_creation_failure ,"program creation failure");
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_input ,"invalid input"); ISAAC_CREATE_NVRTC_EXCEPTION(invalid_input ,"invalid input");
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_program ,"invalid program"); ISAAC_CREATE_NVRTC_EXCEPTION(invalid_program ,"invalid program");
@@ -49,8 +49,6 @@ namespace nvrtc
ISAAC_CREATE_NVRTC_EXCEPTION(unknown_error ,"unknown error"); ISAAC_CREATE_NVRTC_EXCEPTION(unknown_error ,"unknown error");
#undef ISAAC_CREATE_NVRTC_EXCEPTION #undef ISAAC_CREATE_NVRTC_EXCEPTION
void check(nvrtcResult err);
} }
@@ -120,11 +118,26 @@ namespace cuda
ISAAC_CREATE_CUDA_EXCEPTION(unknown ,"unknown"); ISAAC_CREATE_CUDA_EXCEPTION(unknown ,"unknown");
#undef ISAAC_CREATE_CUDA_EXCEPTION #undef ISAAC_CREATE_CUDA_EXCEPTION
void check(CUresult);
void check_destruction(CUresult);
} }
namespace cublas
{
#define ISAAC_CREATE_CUBLAS_EXCEPTION(name, msg) class ISAACAPI name: public std::exception { public: const char * what() const throw(){ return "CUBLAS: Error- " msg; } }
ISAAC_CREATE_CUBLAS_EXCEPTION(not_initialized ,"not initialized");
ISAAC_CREATE_CUBLAS_EXCEPTION(alloc_failed ,"alloc failed");
ISAAC_CREATE_CUBLAS_EXCEPTION(invalid_value ,"invalid value");
ISAAC_CREATE_CUBLAS_EXCEPTION(arch_mismatch ,"arch mismatch");
ISAAC_CREATE_CUBLAS_EXCEPTION(mapping_error ,"mapping error");
ISAAC_CREATE_CUBLAS_EXCEPTION(execution_failed ,"execution failed");
ISAAC_CREATE_CUBLAS_EXCEPTION(internal_error ,"internal error");
ISAAC_CREATE_CUBLAS_EXCEPTION(not_supported ,"not supported");
ISAAC_CREATE_CUBLAS_EXCEPTION(license_error ,"license error");
ISAAC_CREATE_CUBLAS_EXCEPTION(unknown ,"unknown");
#undef ISAAC_CREATE_CUBLAS_EXCEPTION
}
namespace ocl namespace ocl
{ {
@@ -183,9 +196,6 @@ namespace ocl
#ifdef CL_INVALID_PROPERTY #ifdef CL_INVALID_PROPERTY
ISAAC_CREATE_CL_EXCEPTION(invalid_property, "invalid property"); ISAAC_CREATE_CL_EXCEPTION(invalid_property, "invalid property");
#endif #endif
ISAACAPI void check(cl_int err);
} }

View File

@@ -68,6 +68,7 @@ public:
typedef std::map<std::pair<expression_type, numeric_type>, std::shared_ptr<value_type> > map_type; typedef std::map<std::pair<expression_type, numeric_type>, std::shared_ptr<value_type> > map_type;
private: private:
static std::shared_ptr<templates::base> create(std::string const & template_name, std::vector<int> const & x); static std::shared_ptr<templates::base> create(std::string const & template_name, std::vector<int> const & x);
static std::shared_ptr<templates::base> create(std::string const & op, std::string const & x);
static void import(std::string const & fname, driver::CommandQueue const & queue); static void import(std::string const & fname, driver::CommandQueue const & queue);
static map_type & init(driver::CommandQueue const & queue); static map_type & init(driver::CommandQueue const & queue);
public: public:

View File

@@ -217,9 +217,9 @@ void backend::platforms(std::vector<Platform> & platforms)
if(dispatch::clinit()) if(dispatch::clinit())
{ {
cl_uint nplatforms; cl_uint nplatforms;
check(dispatch::dispatch::clGetPlatformIDs(0, NULL, &nplatforms)); dispatch::dispatch::clGetPlatformIDs(0, NULL, &nplatforms);
std::vector<cl_platform_id> clplatforms(nplatforms); std::vector<cl_platform_id> clplatforms(nplatforms);
check(dispatch::dispatch::clGetPlatformIDs(nplatforms, clplatforms.data(), NULL)); dispatch::dispatch::clGetPlatformIDs(nplatforms, clplatforms.data(), NULL);
for(cl_platform_id p: clplatforms){ for(cl_platform_id p: clplatforms){
Platform tmp(p); Platform tmp(p);
if(tmp.name().find("CUDA")!=std::string::npos && has_cuda) if(tmp.name().find("CUDA")!=std::string::npos && has_cuda)

View File

@@ -45,7 +45,7 @@ Buffer::Buffer(Context const & context, size_t size) : backend_(context.backend_
switch(backend_) switch(backend_)
{ {
case CUDA: case CUDA:
check(dispatch::cuMemAlloc(&h_.cu(), size)); dispatch::cuMemAlloc(&h_.cu(), size);
break; break;
case OPENCL: case OPENCL:
cl_int err; cl_int err;

View File

@@ -112,6 +112,25 @@ void check(CUresult err)
} }
} }
void check(cublasStatus_t err)
{
using namespace isaac::exception::cublas;
switch(err)
{
case CUBLAS_STATUS_SUCCESS : break;
case CUBLAS_STATUS_NOT_INITIALIZED : throw not_initialized();
case CUBLAS_STATUS_ALLOC_FAILED : throw alloc_failed();
case CUBLAS_STATUS_INVALID_VALUE : throw invalid_value();
case CUBLAS_STATUS_ARCH_MISMATCH : throw arch_mismatch();
case CUBLAS_STATUS_MAPPING_ERROR : throw mapping_error();
case CUBLAS_STATUS_EXECUTION_FAILED: throw execution_failed();
case CUBLAS_STATUS_INTERNAL_ERROR : throw internal_error();
case CUBLAS_STATUS_NOT_SUPPORTED : throw not_supported();
case CUBLAS_STATUS_LICENSE_ERROR : throw license_error();
default : throw unknown();
}
}
void check_destruction(CUresult result) void check_destruction(CUresult result)
{ {
if(result!=CUDA_ERROR_DEINITIALIZED) if(result!=CUDA_ERROR_DEINITIALIZED)
@@ -179,6 +198,7 @@ void check(cl_int err)
} }
} }
} }
} }

View File

@@ -50,7 +50,7 @@ CommandQueue::CommandQueue(Context const & context, Device const & device, cl_co
{ {
case CUDA: case CUDA:
{ {
check(dispatch::cuStreamCreate(&h_.cu(), 0)); dispatch::cuStreamCreate(&h_.cu(), 0);
break; break;
} }
@@ -86,8 +86,8 @@ void CommandQueue::synchronize()
{ {
switch(backend_) switch(backend_)
{ {
case CUDA: check(dispatch::cuStreamSynchronize(h_.cu())); break; case CUDA: dispatch::cuStreamSynchronize(h_.cu()); break;
case OPENCL: check(dispatch::clFinish(h_.cl())); break; case OPENCL: dispatch::clFinish(h_.cl()); break;
default: throw; default: throw;
} }
} }
@@ -98,16 +98,16 @@ void CommandQueue::enqueue(Kernel const & kernel, NDRange global, driver::NDRang
{ {
case CUDA: case CUDA:
if(event) if(event)
check(dispatch::cuEventRecord(event->h_.cu().first, h_.cu())); dispatch::cuEventRecord(event->h_.cu().first, h_.cu());
check(dispatch::cuLaunchKernel(kernel.h_.cu(), global[0]/local[0], global[1]/local[1], global[2]/local[2], dispatch::cuLaunchKernel(kernel.h_.cu(), global[0]/local[0], global[1]/local[1], global[2]/local[2],
local[0], local[1], local[2], 0, h_.cu(),(void**)&kernel.cu_params_[0], NULL)); local[0], local[1], local[2], 0, h_.cu(),(void**)&kernel.cu_params_[0], NULL);
if(event) if(event)
check(dispatch::cuEventRecord(event->h_.cu().second, h_.cu())); dispatch::cuEventRecord(event->h_.cu().second, h_.cu());
break; break;
case OPENCL: case OPENCL:
check(dispatch::clEnqueueNDRangeKernel(h_.cl(), kernel.h_.cl(), global.dimension(), NULL, (const size_t *)global, (const size_t *) local, 0, NULL, event?&event->h_.cl():NULL)); dispatch::clEnqueueNDRangeKernel(h_.cl(), kernel.h_.cl(), global.dimension(), NULL, (const size_t *)global, (const size_t *) local, 0, NULL, event?&event->h_.cl():NULL);
break; break;
default: throw; default: throw;
} }
@@ -119,12 +119,12 @@ void CommandQueue::write(Buffer const & buffer, bool blocking, std::size_t offse
{ {
case CUDA: case CUDA:
if(blocking) if(blocking)
check(dispatch::cuMemcpyHtoD(buffer.h_.cu() + offset, ptr, size)); dispatch::cuMemcpyHtoD(buffer.h_.cu() + offset, ptr, size);
else else
check(dispatch::cuMemcpyHtoDAsync(buffer.h_.cu() + offset, ptr, size, h_.cu())); dispatch::cuMemcpyHtoDAsync(buffer.h_.cu() + offset, ptr, size, h_.cu());
break; break;
case OPENCL: case OPENCL:
check(dispatch::clEnqueueWriteBuffer(h_.cl(), buffer.h_.cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL)); dispatch::clEnqueueWriteBuffer(h_.cl(), buffer.h_.cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL);
break; break;
default: throw; default: throw;
} }
@@ -136,12 +136,12 @@ void CommandQueue::read(Buffer const & buffer, bool blocking, std::size_t offset
{ {
case CUDA: case CUDA:
if(blocking) if(blocking)
check(dispatch::cuMemcpyDtoH(ptr, buffer.h_.cu() + offset, size)); dispatch::cuMemcpyDtoH(ptr, buffer.h_.cu() + offset, size);
else else
check(dispatch::cuMemcpyDtoHAsync(ptr, buffer.h_.cu() + offset, size, h_.cu())); dispatch::cuMemcpyDtoHAsync(ptr, buffer.h_.cu() + offset, size, h_.cu());
break; break;
case OPENCL: case OPENCL:
check(dispatch::clEnqueueReadBuffer(h_.cl(), buffer.h_.cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL)); dispatch::clEnqueueReadBuffer(h_.cl(), buffer.h_.cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL);
break; break;
default: throw; default: throw;
} }

View File

@@ -74,7 +74,7 @@ Context::Context(Device const & device) : backend_(device.backend_), device_(dev
switch(backend_) switch(backend_)
{ {
case CUDA: case CUDA:
check(dispatch::cuCtxCreate(&h_.cu(), CU_CTX_SCHED_AUTO, device.h_.cu())); dispatch::cuCtxCreate(&h_.cu(), CU_CTX_SCHED_AUTO, device.h_.cu());
break; break;
case OPENCL: case OPENCL:
cl_int err; cl_int err;

View File

@@ -38,7 +38,7 @@ template<CUdevice_attribute attr>
int Device::cuGetInfo() const int Device::cuGetInfo() const
{ {
int res; int res;
check(dispatch::cuDeviceGetAttribute(&res, attr, h_.cu())); dispatch::cuDeviceGetAttribute(&res, attr, h_.cu());
return res; return res;
} }
@@ -197,7 +197,7 @@ std::string Device::name() const
{ {
case CUDA: case CUDA:
char tmp[128]; char tmp[128];
check(dispatch::cuDeviceGetName(tmp, 128, h_.cu())); dispatch::cuDeviceGetName(tmp, 128, h_.cu());
return std::string(tmp); return std::string(tmp);
case OPENCL: case OPENCL:
return ocl::info<CL_DEVICE_NAME>(h_.cl()); return ocl::info<CL_DEVICE_NAME>(h_.cl());

View File

@@ -20,6 +20,7 @@
*/ */
#include "isaac/driver/dispatch.h" #include "isaac/driver/dispatch.h"
#include "isaac/driver/context.h"
namespace isaac namespace isaac
{ {
@@ -126,11 +127,8 @@ bool dispatch::nvrtcinit()
bool dispatch::cublasinit() bool dispatch::cublasinit()
{ {
if(cublas_==nullptr){ if(cublas_==nullptr)
cublas_ = dlopen("libcublas.so", RTLD_LAZY); cublas_ = dlopen("libcublas.so", RTLD_LAZY);
if(cublas_!=nullptr)
dispatch::cublasCreate_v2(&cublas_handle_);
}
return cublas_ != nullptr; return cublas_ != nullptr;
} }
@@ -212,19 +210,28 @@ NVRTC_DEFINE2(nvrtcResult, nvrtcGetPTXSize, nvrtcProgram, size_t *)
NVRTC_DEFINE6(nvrtcResult, nvrtcCreateProgram, nvrtcProgram *, const char *, const char *, int, const char **, const char **) NVRTC_DEFINE6(nvrtcResult, nvrtcCreateProgram, nvrtcProgram *, const char *, const char *, int, const char **, const char **)
NVRTC_DEFINE2(nvrtcResult, nvrtcGetProgramLog, nvrtcProgram, char *) NVRTC_DEFINE2(nvrtcResult, nvrtcGetProgramLog, nvrtcProgram, char *)
cublasHandle_t dispatch::cublasHandle(Context const & ctx)
{
static std::map<Context, cublasHandle_t> handles;
auto pr = handles.insert({ctx, cublasHandle_t()});
if(pr.second)
cublasCreate_v2(&pr.first->second);
return pr.first->second;
}
CUBLAS_DEFINE1(cublasStatus_t, cublasCreate_v2, cublasHandle_t*) CUBLAS_DEFINE1(cublasStatus_t, cublasCreate_v2, cublasHandle_t*)
cublasStatus_t dispatch::cublasGetStream(cudaStream_t *a) cublasStatus_t dispatch::cublasGetStream(cublasHandle_t h, cudaStream_t *a)
{ return f_impl<dispatch::cublasinit>(cublas_, cublasGetStream_v2, cublasGetStream_, "cublasGetStream_v2", cublas_handle_, a); } { return f_impl<dispatch::cublasinit>(cublas_, cublasGetStream_v2, cublasGetStream_, "cublasGetStream_v2", h, a); }
cublasStatus_t dispatch::cublasSetStream(cudaStream_t a) cublasStatus_t dispatch::cublasSetStream(cublasHandle_t h, cudaStream_t a)
{ return f_impl<dispatch::cublasinit>(cublas_, cublasSetStream_v2, cublasSetStream_, "cublasSetStream_v2", cublas_handle_, a); } { return f_impl<dispatch::cublasinit>(cublas_, cublasSetStream_v2, cublasSetStream_, "cublasSetStream_v2", h, a); }
cublasStatus_t dispatch::cublasSgemm(cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc) cublasStatus_t dispatch::cublasSgemm(cublasHandle_t h, cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc)
{ return f_impl<dispatch::cublasinit>(cublas_, cublasSgemm_v2, cublasSgemm_, "cublasSgemm_v2", cublas_handle_, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);} { return f_impl<dispatch::cublasinit>(cublas_, cublasSgemm_v2, cublasSgemm_, "cublasSgemm_v2", h, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
cublasStatus_t dispatch::cublasDgemm(cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc) cublasStatus_t dispatch::cublasDgemm(cublasHandle_t h, cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc)
{ return f_impl<dispatch::cublasinit>(cublas_, cublasDgemm_v2, cublasDgemm_, "cublasDgemm_v2", cublas_handle_, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);} { return f_impl<dispatch::cublasinit>(cublas_, cublasDgemm_v2, cublasDgemm_, "cublasDgemm_v2", h, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
void dispatch::release() void dispatch::release()
{ {
@@ -250,7 +257,6 @@ void * dispatch::opencl_;
void * dispatch::cuda_; void * dispatch::cuda_;
void * dispatch::nvrtc_; void * dispatch::nvrtc_;
void * dispatch::cublas_; void * dispatch::cublas_;
cublasHandle_t dispatch::cublas_handle_;
//OpenCL //OpenCL
void* dispatch::clBuildProgram_; void* dispatch::clBuildProgram_;

View File

@@ -33,8 +33,8 @@ Event::Event(backend_type backend) : backend_(backend), h_(backend_, true)
switch(backend_) switch(backend_)
{ {
case CUDA: case CUDA:
check(dispatch::dispatch::cuEventCreate(&h_.cu().first, CU_EVENT_DEFAULT)); dispatch::cuEventCreate(&h_.cu().first, CU_EVENT_DEFAULT);
check(dispatch::dispatch::cuEventCreate(&h_.cu().second, CU_EVENT_DEFAULT)); dispatch::cuEventCreate(&h_.cu().second, CU_EVENT_DEFAULT);
break; break;
case OPENCL: case OPENCL:
break; break;
@@ -54,7 +54,7 @@ long Event::elapsed_time() const
{ {
case CUDA: case CUDA:
float time; float time;
check(dispatch::cuEventElapsedTime(&time, h_.cu().first, h_.cu().second)); dispatch::cuEventElapsedTime(&time, h_.cu().first, h_.cu().second);
return 1e6*time; return 1e6*time;
case OPENCL: case OPENCL:
return static_cast<long>(ocl::info<CL_PROFILING_COMMAND_END>(h_.cl()) - ocl::info<CL_PROFILING_COMMAND_START>(h_.cl())); return static_cast<long>(ocl::info<CL_PROFILING_COMMAND_END>(h_.cl()) - ocl::info<CL_PROFILING_COMMAND_START>(h_.cl()));

View File

@@ -37,7 +37,7 @@ template<class CLType, class CUType>
void Handle<CLType, CUType>::_delete(CUcontext x) { check_destruction(dispatch::cuCtxDestroy(x)); } void Handle<CLType, CUType>::_delete(CUcontext x) { check_destruction(dispatch::cuCtxDestroy(x)); }
template<class CLType, class CUType> template<class CLType, class CUType>
void Handle<CLType, CUType>::_delete(CUdeviceptr x) { check_destruction(dispatch::dispatch::cuMemFree(x)); } void Handle<CLType, CUType>::_delete(CUdeviceptr x) { check_destruction(dispatch::cuMemFree(x)); }
template<class CLType, class CUType> template<class CLType, class CUType>
void Handle<CLType, CUType>::_delete(CUstream x) { check_destruction(dispatch::cuStreamDestroy(x)); } void Handle<CLType, CUType>::_delete(CUstream x) { check_destruction(dispatch::cuStreamDestroy(x)); }
@@ -46,38 +46,38 @@ template<class CLType, class CUType>
void Handle<CLType, CUType>::_delete(CUdevice) { } void Handle<CLType, CUType>::_delete(CUdevice) { }
template<class CLType, class CUType> template<class CLType, class CUType>
void Handle<CLType, CUType>::_delete(CUevent x) { check_destruction(dispatch::dispatch::cuEventDestroy(x)); } void Handle<CLType, CUType>::_delete(CUevent x) { check_destruction(dispatch::cuEventDestroy(x)); }
template<class CLType, class CUType> template<class CLType, class CUType>
void Handle<CLType, CUType>::_delete(CUfunction) { } void Handle<CLType, CUType>::_delete(CUfunction) { }
template<class CLType, class CUType> template<class CLType, class CUType>
void Handle<CLType, CUType>::_delete(CUmodule x) { check_destruction(dispatch::dispatch::cuModuleUnload(x)); } void Handle<CLType, CUType>::_delete(CUmodule x) { check_destruction(dispatch::cuModuleUnload(x)); }
template<class CLType, class CUType> template<class CLType, class CUType>
void Handle<CLType, CUType>::_delete(cu_event_t x) { _delete(x.first); _delete(x.second); } void Handle<CLType, CUType>::_delete(cu_event_t x) { _delete(x.first); _delete(x.second); }
//OpenCL //OpenCL
template<class CLType, class CUType> template<class CLType, class CUType>
void Handle<CLType, CUType>::release(cl_context x) { check(dispatch::clReleaseContext(x)); } void Handle<CLType, CUType>::release(cl_context x) { dispatch::clReleaseContext(x); }
template<class CLType, class CUType> template<class CLType, class CUType>
void Handle<CLType, CUType>::release(cl_mem x) { check(dispatch::clReleaseMemObject(x)); } void Handle<CLType, CUType>::release(cl_mem x) { dispatch::clReleaseMemObject(x); }
template<class CLType, class CUType> template<class CLType, class CUType>
void Handle<CLType, CUType>::release(cl_command_queue x) { check(dispatch::clReleaseCommandQueue(x)); } void Handle<CLType, CUType>::release(cl_command_queue x) { dispatch::clReleaseCommandQueue(x); }
template<class CLType, class CUType> template<class CLType, class CUType>
void Handle<CLType, CUType>::release(cl_device_id x) { check(dispatch::clReleaseDevice(x)); } void Handle<CLType, CUType>::release(cl_device_id x) { dispatch::clReleaseDevice(x); }
template<class CLType, class CUType> template<class CLType, class CUType>
void Handle<CLType, CUType>::release(cl_event x) { check(dispatch::clReleaseEvent(x)); } void Handle<CLType, CUType>::release(cl_event x) { dispatch::clReleaseEvent(x); }
template<class CLType, class CUType> template<class CLType, class CUType>
void Handle<CLType, CUType>::release(cl_kernel x) { check(dispatch::clReleaseKernel(x)); } void Handle<CLType, CUType>::release(cl_kernel x) { dispatch::clReleaseKernel(x); }
template<class CLType, class CUType> template<class CLType, class CUType>
void Handle<CLType, CUType>::release(cl_program x) { check(dispatch::clReleaseProgram(x)); } void Handle<CLType, CUType>::release(cl_program x) { dispatch::clReleaseProgram(x); }
template<class CLType, class CUType> template<class CLType, class CUType>
Handle<CLType, CUType>::Handle(backend_type backend, bool take_ownership): backend_(backend), has_ownership_(take_ownership) Handle<CLType, CUType>::Handle(backend_type backend, bool take_ownership): backend_(backend), has_ownership_(take_ownership)

View File

@@ -45,8 +45,7 @@ struct info<cl_mem>
static void get(cl_mem handle, cl_mem_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret) static void get(cl_mem handle, cl_mem_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret)
{ {
cl_int err = dispatch::clGetMemObjectInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); dispatch::clGetMemObjectInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
check(err);
} }
}; };
@@ -57,8 +56,7 @@ struct info<cl_device_id>
static void get(cl_device_id handle, cl_device_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret) static void get(cl_device_id handle, cl_device_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret)
{ {
cl_int err = dispatch::clGetDeviceInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); dispatch::clGetDeviceInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
check(err);
} }
}; };
@@ -68,13 +66,11 @@ struct info<cl_kernel>
typedef cl_kernel_info type; typedef cl_kernel_info type;
static void get(cl_kernel handle, cl_kernel_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ static void get(cl_kernel handle, cl_kernel_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
cl_int err = dispatch::clGetKernelInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); dispatch::clGetKernelInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
check(err);
} }
static void get(cl_kernel handle, cl_device_id dev_id, cl_kernel_work_group_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ static void get(cl_kernel handle, cl_device_id dev_id, cl_kernel_work_group_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
cl_int err = dispatch::clGetKernelWorkGroupInfo(handle, dev_id, param_name,param_value_size,param_value,param_value_size_ret); dispatch::clGetKernelWorkGroupInfo(handle, dev_id, param_name,param_value_size,param_value,param_value_size_ret);
check(err);
} }
}; };
@@ -84,8 +80,7 @@ struct info<cl_context>
typedef cl_context_info type; typedef cl_context_info type;
static void get(cl_context handle, cl_context_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ static void get(cl_context handle, cl_context_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
cl_int err = dispatch::clGetContextInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); dispatch::clGetContextInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
check(err);
} }
}; };
@@ -95,13 +90,11 @@ struct info<cl_program>
typedef cl_program_info type; typedef cl_program_info type;
static void get(cl_program handle, cl_program_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ static void get(cl_program handle, cl_program_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
cl_int err = dispatch::clGetProgramInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); dispatch::clGetProgramInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
check(err);
} }
static void get(cl_program handle, cl_device_id device, cl_program_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ static void get(cl_program handle, cl_device_id device, cl_program_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
cl_int err = dispatch::clGetProgramBuildInfo(handle,device,param_name,param_value_size,param_value,param_value_size_ret); dispatch::clGetProgramBuildInfo(handle,device,param_name,param_value_size,param_value,param_value_size_ret);
check(err);
} }
}; };
@@ -111,8 +104,7 @@ struct info<cl_event>
{ {
typedef cl_profiling_info type; typedef cl_profiling_info type;
static void get(cl_event handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ static void get(cl_event handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
cl_int err = dispatch::clGetEventProfilingInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); dispatch::clGetEventProfilingInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
check(err);
} }
}; };
@@ -121,8 +113,7 @@ struct info<cl_command_queue>
{ {
typedef cl_command_queue_info type; typedef cl_command_queue_info type;
static void get(cl_command_queue handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ static void get(cl_command_queue handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
cl_int err = dispatch::clGetCommandQueueInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); dispatch::clGetCommandQueueInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
check(err);
} }
}; };
@@ -131,8 +122,7 @@ struct info<cl_platform_id>
{ {
typedef cl_command_queue_info type; typedef cl_command_queue_info type;
static void get(cl_platform_id handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ static void get(cl_platform_id handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
cl_int err = dispatch::clGetPlatformInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); dispatch::clGetPlatformInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
check(err);
} }
}; };

View File

@@ -38,7 +38,7 @@ Kernel::Kernel(Program const & program, const char * name) : backend_(program.ba
case CUDA: case CUDA:
cu_params_store_.reserve(64); cu_params_store_.reserve(64);
cu_params_.reserve(64); cu_params_.reserve(64);
check(dispatch::cuModuleGetFunction(&h_.cu(), program.h_.cu(), name));\ dispatch::cuModuleGetFunction(&h_.cu(), program.h_.cu(), name);\
break; break;
case OPENCL: case OPENCL:
cl_int err; cl_int err;
@@ -85,7 +85,7 @@ void Kernel::setArg(unsigned int index, std::size_t size, void* ptr)
cu_params_[index] = cu_params_store_[index].get(); cu_params_[index] = cu_params_store_[index].get();
break; break;
case OPENCL: case OPENCL:
check(dispatch::clSetKernelArg(h_.cl(), index, size, ptr)); dispatch::clSetKernelArg(h_.cl(), index, size, ptr);
break; break;
default: default:
throw; throw;
@@ -101,7 +101,7 @@ void Kernel::setArg(unsigned int index, Buffer const & data)
setArg(index, sizeof(CUdeviceptr), (void*)&data.h_.cu()); break; setArg(index, sizeof(CUdeviceptr), (void*)&data.h_.cu()); break;
} }
case OPENCL: case OPENCL:
check(dispatch::clSetKernelArg(h_.cl(), index, sizeof(cl_mem), (void*)&data.h_.cl())); dispatch::clSetKernelArg(h_.cl(), index, sizeof(cl_mem), (void*)&data.h_.cl());
break; break;
default: throw; default: throw;
} }

View File

@@ -78,7 +78,7 @@ void Platform::devices(std::vector<Device> & devices) const
case CUDA: case CUDA:
{ {
int N; int N;
check(dispatch::cuDeviceGetCount(&N)); dispatch::cuDeviceGetCount(&N);
for(int i = 0 ; i < N ; ++i){ for(int i = 0 ; i < N ; ++i){
CUdevice device; CUdevice device;
dispatch::cuDeviceGet(&device, i); dispatch::cuDeviceGet(&device, i);
@@ -89,9 +89,9 @@ void Platform::devices(std::vector<Device> & devices) const
case OPENCL: case OPENCL:
{ {
cl_uint ndevices; cl_uint ndevices;
check(dispatch::dispatch::clGetDeviceIDs(cl_platform_, CL_DEVICE_TYPE_ALL, 0, NULL, &ndevices)); dispatch::dispatch::clGetDeviceIDs(cl_platform_, CL_DEVICE_TYPE_ALL, 0, NULL, &ndevices);
std::vector<cl_device_id> device_ids(ndevices); std::vector<cl_device_id> device_ids(ndevices);
check(dispatch::dispatch::clGetDeviceIDs(cl_platform_, CL_DEVICE_TYPE_ALL, ndevices, device_ids.data(), NULL)); dispatch::dispatch::clGetDeviceIDs(cl_platform_, CL_DEVICE_TYPE_ALL, ndevices, device_ids.data(), NULL);
for(cl_device_id d : device_ids) for(cl_device_id d : device_ids)
devices.push_back(Device(d)); devices.push_back(Device(d));
break; break;

View File

@@ -55,7 +55,7 @@ Program::Program(Context const & context, std::string const & source) : backend_
//Load cached program //Load cached program
if(cache_path.size() && std::ifstream(fname, std::ios::binary)) if(cache_path.size() && std::ifstream(fname, std::ios::binary))
{ {
check(dispatch::cuModuleLoad(&h_.cu(), fname.c_str())); dispatch::cuModuleLoad(&h_.cu(), fname.c_str());
break; break;
} }
@@ -64,28 +64,28 @@ Program::Program(Context const & context, std::string const & source) : backend_
const char * includes[] = {"vector.h"}; const char * includes[] = {"vector.h"};
const char * src[] = {helpers::cuda::vector}; const char * src[] = {helpers::cuda::vector};
check(dispatch::nvrtcCreateProgram(&prog, source.c_str(), NULL, 1, src, includes)); dispatch::nvrtcCreateProgram(&prog, source.c_str(), NULL, 1, src, includes);
try{ try{
std::pair<unsigned int, unsigned int> capability = context_.device().nv_compute_capability(); std::pair<unsigned int, unsigned int> capability = context_.device().nv_compute_capability();
std::string capability_opt = "--gpu-architecture=compute_"; std::string capability_opt = "--gpu-architecture=compute_";
capability_opt += tools::to_string(capability.first) + tools::to_string(capability.second); capability_opt += tools::to_string(capability.first) + tools::to_string(capability.second);
const char * options[] = {capability_opt.c_str(), "--restrict"}; const char * options[] = {capability_opt.c_str(), "--restrict"};
check(dispatch::nvrtcCompileProgram(prog, 2, options)); dispatch::nvrtcCompileProgram(prog, 2, options);
}catch(exception::nvrtc::compilation const &) }catch(exception::nvrtc::compilation const &)
{ {
size_t logsize; size_t logsize;
check(dispatch::nvrtcGetProgramLogSize(prog, &logsize)); dispatch::nvrtcGetProgramLogSize(prog, &logsize);
std::string log(logsize, 0); std::string log(logsize, 0);
check(dispatch::nvrtcGetProgramLog(prog, (char*)log.data())); dispatch::nvrtcGetProgramLog(prog, (char*)log.data());
std::cout << "Compilation failed:" << std::endl; std::cout << "Compilation failed:" << std::endl;
std::cout << log << std::endl; std::cout << log << std::endl;
} }
size_t ptx_size; size_t ptx_size;
check(dispatch::nvrtcGetPTXSize(prog, &ptx_size)); dispatch::nvrtcGetPTXSize(prog, &ptx_size);
std::vector<char> ptx(ptx_size); std::vector<char> ptx(ptx_size);
check(dispatch::nvrtcGetPTX(prog, ptx.data())); dispatch::nvrtcGetPTX(prog, ptx.data());
check(dispatch::cuModuleLoadDataEx(&h_.cu(), ptx.data(), 0, NULL, NULL)); dispatch::cuModuleLoadDataEx(&h_.cu(), ptx.data(), 0, NULL, NULL);
//Save cached program //Save cached program
if (cache_path.size()) if (cache_path.size())
@@ -112,7 +112,7 @@ Program::Program(Context const & context, std::string const & source) : backend_
// str.assign((std::istreambuf_iterator<char>(ifs)), // str.assign((std::istreambuf_iterator<char>(ifs)),
// std::istreambuf_iterator<char>()); // std::istreambuf_iterator<char>());
// check(dispatch::cuModuleLoadDataEx(&h_.cu(), str.c_str(), 0, NULL, NULL)); // dispatch::cuModuleLoadDataEx(&h_.cu(), str.c_str(), 0, NULL, NULL);
break; break;
} }
@@ -141,7 +141,7 @@ Program::Program(Context const & context, std::string const & source) : backend_
char* cbuffer = buffer.data(); char* cbuffer = buffer.data();
h_.cl() = dispatch::clCreateProgramWithBinary(context_.h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), &len, (const unsigned char **)&cbuffer, NULL, &err); h_.cl() = dispatch::clCreateProgramWithBinary(context_.h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), &len, (const unsigned char **)&cbuffer, NULL, &err);
check(err); check(err);
check(dispatch::clBuildProgram(h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), build_opt.c_str(), NULL, NULL)); dispatch::clBuildProgram(h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), build_opt.c_str(), NULL, NULL);
return; return;
} }
} }
@@ -150,7 +150,7 @@ Program::Program(Context const & context, std::string const & source) : backend_
const char * csrc = source.c_str(); const char * csrc = source.c_str();
h_.cl() = dispatch::clCreateProgramWithSource(context_.h_.cl(), 1, &csrc, &srclen, &err); h_.cl() = dispatch::clCreateProgramWithSource(context_.h_.cl(), 1, &csrc, &srclen, &err);
try{ try{
check(dispatch::clBuildProgram(h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), build_opt.c_str(), NULL, NULL)); dispatch::clBuildProgram(h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), build_opt.c_str(), NULL, NULL);
//Save cached program //Save cached program
if (cache_path.size()) if (cache_path.size())
{ {

View File

@@ -50,7 +50,7 @@ std::vector<int_t> infos(expression_tree const & tree, symbolic::preset::gemm::a
} }
/* ------------------ CUBLAS ------------------ */ /* ------------------ CUBLAS ------------------ */
cublas_gemm::cublas_gemm(char A_trans, char B_trans): A_trans_(A_trans), B_trans_(B_trans), init_(driver::dispatch::cublasinit()) cublas_gemm::cublas_gemm(char A_trans, char B_trans): A_trans_(A_trans), B_trans_(B_trans), init_(true)
{ } { }
int cublas_gemm::is_invalid(expression_tree const &, driver::Device const & device) const int cublas_gemm::is_invalid(expression_tree const &, driver::Device const & device) const
@@ -85,30 +85,31 @@ void cublas_gemm::enqueue(driver::CommandQueue & queue, driver::Program const &,
CUdeviceptr cuB = args.B->array.handle.cu; CUdeviceptr cuB = args.B->array.handle.cu;
CUdeviceptr cuC = args.C->array.handle.cu; CUdeviceptr cuC = args.C->array.handle.cu;
runtime::execution_options_type const & opt = control.execution_options(); runtime::execution_options_type const & opt = control.execution_options();
auto cuT = [](char xt) { return xt=='N'?CUBLAS_OP_N:CUBLAS_OP_T; }; auto cuT = [](char xt) { return (xt=='N')?CUBLAS_OP_N:CUBLAS_OP_T; };
int offA = args.A->array.start, offB = args.B->array.start, offC = args.C->array.start;
cublasHandle_t h = drv::dispatch::cublasHandle(queue.context());
//Set new stream //Set new stream
cudaStream_t bkp; cudaStream_t bkp;
drv::Event event(drv::CUDA); drv::Event event(drv::CUDA);
drv::dispatch::cublasGetStream(&bkp); drv::dispatch::cublasGetStream(h,&bkp);
drv::dispatch::cublasSetStream((cudaStream_t)queue.handle().cu()); drv::dispatch::cublasSetStream(h,(cudaStream_t)queue.handle().cu());
values_holder alpha = args.alpha.values(); values_holder alpha = args.alpha.values();
values_holder beta = args.beta.values(); values_holder beta = args.beta.values();
if(opt.events) if(opt.events)
drv::check(drv::dispatch::cuEventRecord(event.handle().cu().first, queue.handle().cu())); drv::check(drv::dispatch::cuEventRecord(event.handle().cu().first, queue.handle().cu()));
if(args.C->dtype==FLOAT_TYPE) if(args.C->dtype==FLOAT_TYPE)
drv::dispatch::cublasSgemm(cuT(A_trans_), cuT(B_trans_), M, N, K, &alpha.float32, (float*)cuA, args.A->ld[1], (float*)cuB, args.B->ld[1], &beta.float32, (float*)cuC, args.C->ld[1]); drv::dispatch::cublasSgemm(h,cuT(A_trans_), cuT(B_trans_), M, N, K, &alpha.float32, (float*)cuA + offA , args.A->ld[1], (float*)cuB + offB, args.B->ld[1], &beta.float32, (float*)cuC + offC, args.C->ld[1]);
else else
drv::dispatch::cublasDgemm(cuT(A_trans_), cuT(B_trans_), M, N, K, &alpha.float64, (double*)cuA, args.A->ld[1], (double*)cuB, args.B->ld[1], &beta.float64, (double*)cuC, args.C->ld[1]); drv::dispatch::cublasDgemm(h,cuT(A_trans_), cuT(B_trans_), M, N, K, &alpha.float64, (double*)cuA + offA, args.A->ld[1], (double*)cuB + offB, args.B->ld[1], &beta.float64, (double*)cuC + offC, args.C->ld[1]);
if(opt.events){ if(opt.events){
drv::check(drv::dispatch::cuEventRecord(event.handle().cu().second, queue.handle().cu())); drv::check(drv::dispatch::cuEventRecord(event.handle().cu().second, queue.handle().cu()));
opt.events->push_back(event); opt.events->push_back(event);
} }
//Revert old stream //Revert old stream
drv::dispatch::cublasSetStream(bkp); drv::dispatch::cublasSetStream(h,bkp);
} }
/* -------------------------------------------- */ /* -------------------------------------------- */
unsigned int gemm::lmem_usage(expression_tree const & expression) const unsigned int gemm::lmem_usage(expression_tree const & expression) const
{ {

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@@ -139,6 +139,16 @@ profiles::value_type::templates_container const & profiles::value_type::template
return templates_; return templates_;
} }
std::shared_ptr<templates::base> profiles::create(std::string const & op, std::string const & str)
{
if(str=="cublas_gemm"){
if(op=="gemm_nn") return std::shared_ptr<templates::base>(new templates::cublas_gemm('N', 'N'));
if(op=="gemm_nt") return std::shared_ptr<templates::base>(new templates::cublas_gemm('N', 'T'));
if(op=="gemm_tn") return std::shared_ptr<templates::base>(new templates::cublas_gemm('T', 'N'));
if(op=="gemm_tt") return std::shared_ptr<templates::base>(new templates::cublas_gemm('T', 'T'));
}
throw;
}
std::shared_ptr<templates::base> profiles::create(std::string const & template_name, std::vector<int> const & x) std::shared_ptr<templates::base> profiles::create(std::string const & template_name, std::vector<int> const & x)
{ {
@@ -189,8 +199,12 @@ void profiles::import(std::string const & str, driver::CommandQueue const & queu
// Get profiles // Get profiles
std::vector<std::shared_ptr<templates::base> > templates; std::vector<std::shared_ptr<templates::base> > templates;
rapidjson::Value const & profiles = document[opcstr][dtcstr]["profiles"]; rapidjson::Value const & profiles = document[opcstr][dtcstr]["profiles"];
for (rapidjson::SizeType id = 0 ; id < profiles.Size() ; ++id) for (rapidjson::SizeType i = 0 ; i < profiles.Size() ; ++i){
templates.push_back(create(operation, rapidjson::to_int_array<int>(profiles[id]))); if(profiles[i].IsString())
templates.push_back(create(operation, profiles[i].GetString()));
else
templates.push_back(create(operation, rapidjson::to_int_array<int>(profiles[i])));
}
if(templates.size()>1){ if(templates.size()>1){
// Get predictor // Get predictor
predictors::random_forest predictor(document[opcstr][dtcstr]["predictor"]); predictors::random_forest predictor(document[opcstr][dtcstr]["predictor"]);

View File

@@ -174,7 +174,7 @@ def is_local_optimum(parameters, template, sizes, context):
#Evaluate the provided parameters guess #Evaluate the provided parameters guess
reference = tools.benchmark(template(*parameters), tree) reference = tools.benchmark(template(*parameters), tree)
if isinf(reference): if reference==float('inf'):
return False return False
#Latency bound -- ignore #Latency bound -- ignore

View File

@@ -40,7 +40,8 @@ def linspace(a, b, n=100):
def expspace(a,b,N,r=128): def expspace(a,b,N,r=128):
return [int(ceil(exp(x)/r)*r) for x in linspace(log(a), log(b), N)] return [int(ceil(exp(x)/r)*r) for x in linspace(log(a), log(b), N)]
def benchmark(template, tree):
def benchmark(template, tree, operation=sc.templates.gemm_nn):
queue = tree.context.queues[0] queue = tree.context.queues[0]
queue.profiles[template, sc.float32] = sc.profile(template, sc.float32, queue) queue.profiles[template, sc.float32] = sc.profile(template, sc.float32, queue)
times = [] times = []
@@ -49,14 +50,14 @@ def benchmark(template, tree):
#Warm-up #Warm-up
try: try:
z, events = sc.driver.enqueue(tree) z, events = sc.driver.enqueue(tree)
tree.context.queues[0].synchronize() queue.synchronize()
except profile_execution_failure: except profile_execution_failure:
return float("inf") return float("inf")
#Time #Time
while total < 1e-1: while total < 1e-2:
start = time() start = time()
z, events = sc.driver.enqueue(tree) z, events = sc.driver.enqueue(tree)
tree.context.queues[0].synchronize() queue.synchronize()
end = time() end = time()
times.append(end - start) times.append(end - start)
total += times[-1] total += times[-1]
@@ -91,11 +92,12 @@ def tree_of(template, sizes, context):
AT = template is sc.templates.gemm_tn or template is sc.templates.gemm_tt AT = template is sc.templates.gemm_tn or template is sc.templates.gemm_tt
BT = template is sc.templates.gemm_nt or template is sc.templates.gemm_tt BT = template is sc.templates.gemm_nt or template is sc.templates.gemm_tt
M, N, K = sizes M, N, K = sizes
C = sc.empty((M,N), context=context)
A = sc.empty((K, M) if AT else (M, K), context=context) A = sc.empty((K, M) if AT else (M, K), context=context)
B = sc.empty((N, K) if BT else (K, N), context=context) B = sc.empty((N, K) if BT else (K, N), context=context)
AA = A.T if AT else A AA = A.T if AT else A
BB = B.T if BT else B BB = B.T if BT else B
return sc.dot(AA, BB), (A, B) return sc.assign(C, sc.dot(AA, BB)), (A, B, C)
def memory_footprint(template, sizes): def memory_footprint(template, sizes):
if issubclass(template, sc.templates.elementwise_1d): if issubclass(template, sc.templates.elementwise_1d):
@@ -123,14 +125,16 @@ def metric_name_of(template):
return 'GB/S' return 'GB/S'
def external_profiles(template): def external_profiles(template):
res = []
if template is sc.templates.gemm_nn: if template is sc.templates.gemm_nn:
return [sc.templates.cublas_gemm('N', 'N')] res += [sc.templates.cublas_gemm('N','N')]
elif template is sc.templates.gemm_tn: elif template is sc.templates.gemm_tn:
return [sc.templates.cublas_gemm('T', 'N')] res += [sc.templates.cublas_gemm('T','N')]
elif template is sc.templates.gemm_nt: elif template is sc.templates.gemm_nt:
return [sc.templates.cublas_gemm('N', 'T')] res += [sc.templates.cublas_gemm('N','T')]
elif template is sc.templates.gemm_tt: elif template is sc.templates.gemm_tt:
return [sc.templates.cublas_gemm('T', 'T')] res += [sc.templates.cublas_gemm('T','T')]
return res
def genetic_infos_of(template): def genetic_infos_of(template):
if issubclass(template, sc.templates.elementwise_1d): if issubclass(template, sc.templates.elementwise_1d):
@@ -144,4 +148,8 @@ def genetic_infos_of(template):
elif issubclass(template, sc.templates.gemm): elif issubclass(template, sc.templates.gemm):
return {'categorical': [8,9], 'nbits': [3,3,3,3,3,2,2,2,2,2,3,3]} return {'categorical': [8,9], 'nbits': [3,3,3,3,3,2,2,2,2,2,3,3]}
def convert(profile):
if isinstance(profile, str):
return profile
else:
return map(int, profile)

View File

@@ -77,11 +77,14 @@ class Tuner:
#Square #Square
for N in [896, 1760, 2048, 2560]: for N in [896, 1760, 2048, 2560]:
sizes += [(N, N)] sizes += [(N, N)]
#Tall and Skinny #Short/Fat
for M in [16, 32, 64, 128]: for M in [16, 32, 64, 128]:
for N in [1024, 4096, 16384, 65536, 262144]: for N in [1024, 4096, 16384, 65536, 262144]:
sizes += [(M, N)] sizes += [(M, N)]
sizes += [(N, M)] #Tall/Skinny
for N in [16, 32, 64, 128]:
for M in [1024, 4096, 16384, 65536, 262144]:
sizes += [(M, N)]
#BLAS3 training sizes #BLAS3 training sizes
if operation in [sc.templates.gemm_nn, sc.templates.gemm_nt, sc.templates.gemm_tn, sc.templates.gemm_tt]: if operation in [sc.templates.gemm_nn, sc.templates.gemm_nt, sc.templates.gemm_tn, sc.templates.gemm_tt]:
@@ -113,8 +116,6 @@ class Tuner:
try: try:
with open(os.path.join(savepath, 'X.csv')) as f: with open(os.path.join(savepath, 'X.csv')) as f:
X = [tuple(map(int, row)) for row in csv.reader(f, delimiter=',')] X = [tuple(map(int, row)) for row in csv.reader(f, delimiter=',')]
with open(os.path.join(savepath, 'Y.csv')) as f:
Y = [map(float, row) for row in csv.reader(f, delimiter=',')]
with open(os.path.join(savepath, 'profiles.csv')) as f: with open(os.path.join(savepath, 'profiles.csv')) as f:
def mmap(x): def mmap(x):
if x=='FETCH_FROM_LOCAL': if x=='FETCH_FROM_LOCAL':
@@ -125,6 +126,8 @@ class Tuner:
return sc.templates.fetch_type.FETCH_FROM_GLOBAL_STRIDED return sc.templates.fetch_type.FETCH_FROM_GLOBAL_STRIDED
return int(x) return int(x)
profiles = [map(mmap,row) for v in row for row in csv.reader(f, delimiter=',')] profiles = [map(mmap,row) for v in row for row in csv.reader(f, delimiter=',')]
with open(os.path.join(savepath, 'Y.csv')) as f:
Y = [map(float, row) for row in csv.reader(f, delimiter=',')]
except: except:
pass pass
@@ -140,29 +143,30 @@ class Tuner:
self.progress_bar.update(1, 1, profiles[argmax(row)], max(row)) self.progress_bar.update(1, 1, profiles[argmax(row)], max(row))
continue continue
tree, operands = tools.tree_of(operation, x, context) tree, operands = tools.tree_of(operation, x, context)
#Check if GA needs to run (i.e., current best prediction is not a local optimum) #Best predicted profile for x
tune = True
best = None best = None
if idx > 0: if idx > 0:
dim = min(10, idx+1) if len(profiles) > 1:
clf = RandomForestRegressor(dim, dim).fit(X, Y) clf = RandomForestRegressor(20, max_depth=5).fit(X, Y)
predictions = clf.predict(x)[0] predictions = clf.predict(x)[0]
for idx in (-predictions).argsort(): for idx in (-predictions).argsort():
ts = tools.benchmark(operation(*profiles[idx]), tree)
if np.isfinite(ts):
break
if np.isfinite(ts):
best = profiles[idx] best = profiles[idx]
tune = not optimize.is_local_optimum(predicted, operation, x, context) ts = tools.benchmark(operation(*best), tree)
if ts != float('inf'):
break
else:
best = profiles[0]
#Retune if necessary #Retune if necessary
tune = not (best and optimize.is_local_optimum(best, operation, x, context))
if tune: if tune:
optimizer = optimize.GeneticOptimizer(self.logger, naccept=1000, niter=1000, cxpb=.4, mutpb=.4, popsize=20, progress_bar = self.progress_bar) optimizer = optimize.GeneticOptimizer(self.logger, naccept=1000, niter=1000, cxpb=.4, mutpb=.4, popsize=20, progress_bar = self.progress_bar)
best = optimizer.run(operation, x, context, prior=best)[0] best = optimizer.run(operation, x, context, prior=best)[0]
if best not in profiles: if best not in profiles:
profiles.append(best) profiles.append(best)
for xx,yy in zip(X, Y): for xx,yy in zip(X, Y):
tree, _operands = tools.tree_of(operation, xx, context) tree, _ = tools.tree_of(operation, xx, context)
time = tools.benchmark(operation(*best), _tree) time = tools.benchmark(operation(*best), tree)
yy.append(performance(xx, time)) yy.append(performance(xx, time))
#Update dataset #Update dataset
X.append(x) X.append(x)
@@ -182,11 +186,12 @@ class Tuner:
for prof in tools.external_profiles(operation): for prof in tools.external_profiles(operation):
for x, y in zip(X, Y): for x, y in zip(X, Y):
tree, operands = tools.tree_of(operation, x, context) tree, operands = tools.tree_of(operation, x, context)
perf = performance(x,tools.benchmark(prof, tree)) perf = performance(x,tools.benchmark(prof, tree, operation))
if perf > 0: if perf > 0:
profiles.append(prof.__class__.__name__) profiles.append(prof.__class__.__name__)
y.append(perf) y.append(perf)
#Pruning of useless profiles #Pruning of useless profiles
if len(Y[0]) > 1: if len(Y[0]) > 1:
unused = np.where(np.bincount(np.argmax(Y, 1))==0)[0] unused = np.where(np.bincount(np.argmax(Y, 1))==0)[0]
@@ -212,5 +217,5 @@ class Tuner:
'threshold': e.tree_.threshold.astype('float64').tolist(), 'threshold': e.tree_.threshold.astype('float64').tolist(),
'feature': e.tree_.feature.astype('float64').tolist(), 'feature': e.tree_.feature.astype('float64').tolist(),
'value': e.tree_.value[:,:,0].astype('float64').tolist()} for e in clf.estimators_] 'value': e.tree_.value[:,:,0].astype('float64').tolist()} for e in clf.estimators_]
D['profiles'] = [map(int, x) for x in profiles] D['profiles'] = [tools.convert(x) for x in profiles]
json.dump(json_data, open(json_path,'w')) json.dump(json_data, open(json_path,'w'))