GEMM: Enabled use of cuBLAS when predicted beneficial
This commit is contained in:
@@ -29,7 +29,7 @@ double bench(OP const & op, SYNC const & sync)
|
||||
double total_time = 0;
|
||||
op();
|
||||
sync();
|
||||
while(total_time*1e-9 < 1e-1){
|
||||
while(total_time*1e-9 < 2e-1){
|
||||
tmr.start();
|
||||
op();
|
||||
sync();
|
||||
@@ -239,12 +239,13 @@ void bench(sc::numeric_type dtype, std::string operation)
|
||||
#ifdef BENCH_CUBLAS
|
||||
times.push_back(bench([&](){cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)cu(A), lda, (T*)cu(B), ldb, 1, (T*)cu(C), ldc);}, cusync));
|
||||
#endif
|
||||
std::cout << name << "\t" << M << "\t" << N << "\t" << K << "\t" << cAT << "\t" << cBT;
|
||||
std::transform(times.begin(), times.end(), std::back_inserter(tflops), [&](double t){ return 2*M*N*K/t*1e-3;});
|
||||
double best = max(tflops);
|
||||
auto fastest = tflops;
|
||||
std::sort(fastest.begin(), fastest.end(), std::greater<double>());
|
||||
std::cout << name << "\t" << M << "\t" << N << "\t" << K << "\t" << cAT << "\t" << cBT;
|
||||
for(auto x: tflops){
|
||||
std::cout << "\t";
|
||||
if(x==best)
|
||||
if(x == fastest[0] && x/fastest[1] >= 1.02)
|
||||
std::cout << color_stream(FG_LIGHT_BLUE) << x << color_stream(RESET);
|
||||
else
|
||||
std::cout << x;
|
||||
|
@@ -38,13 +38,6 @@ enum backend_type
|
||||
CUDA
|
||||
};
|
||||
|
||||
void check(nvrtcResult err);
|
||||
|
||||
void check(CUresult);
|
||||
void check_destruction(CUresult);
|
||||
|
||||
void check(cl_int err);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -50,7 +50,7 @@ private:
|
||||
static CUdevice device(CUcontext)
|
||||
{
|
||||
CUdevice res;
|
||||
check(dispatch::cuCtxGetDevice(&res));
|
||||
dispatch::cuCtxGetDevice(&res);
|
||||
return res;
|
||||
}
|
||||
|
||||
|
@@ -32,7 +32,8 @@
|
||||
#include "isaac/driver/external/CUDA/cuda.h"
|
||||
#include "isaac/driver/external/CUDA/nvrtc.h"
|
||||
#include "isaac/driver/external/CUDA/cublas.h"
|
||||
|
||||
//Exceptions
|
||||
#include "isaac/driver/common.h"
|
||||
#include <iostream>
|
||||
|
||||
namespace isaac
|
||||
@@ -40,6 +41,14 @@ namespace isaac
|
||||
namespace driver
|
||||
{
|
||||
|
||||
class Context;
|
||||
|
||||
template<class T> void check(T){}
|
||||
void check(nvrtcResult err);
|
||||
void check(CUresult err);
|
||||
void check(cublasStatus_t err);
|
||||
void check(cl_int err);
|
||||
void check_destruction(CUresult);
|
||||
|
||||
class dispatch
|
||||
{
|
||||
@@ -61,11 +70,11 @@ private:
|
||||
cache = dlsym(lib_h, name);
|
||||
FunPtrT fptr;
|
||||
*reinterpret_cast<void **>(&fptr) = cache;
|
||||
return (*fptr)(args...);
|
||||
typename return_type<FunPtrT>::type res = (*fptr)(args...);
|
||||
check(res);
|
||||
return res;
|
||||
}
|
||||
|
||||
static cublasStatus_t cublasCreate_v2(cublasHandle_t* h);
|
||||
|
||||
public:
|
||||
static bool clinit();
|
||||
static bool cublasinit();
|
||||
@@ -146,17 +155,19 @@ public:
|
||||
static nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames);
|
||||
static nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
|
||||
|
||||
static cublasStatus_t cublasGetStream(cudaStream_t *streamId);
|
||||
static cublasStatus_t cublasSetStream(cudaStream_t streamId);
|
||||
static cublasStatus_t cublasSgemm (cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc);
|
||||
static cublasStatus_t cublasDgemm (cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc);
|
||||
static cublasHandle_t cublasHandle(Context const & ctx);
|
||||
static cublasStatus_t cublasCreate_v2(cublasHandle_t* h);
|
||||
static cublasStatus_t cublasGetStream(cublasHandle_t h, cudaStream_t *streamId);
|
||||
static cublasStatus_t cublasSetStream(cublasHandle_t h, cudaStream_t streamId);
|
||||
static cublasStatus_t cublasSgemm (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc);
|
||||
static cublasStatus_t cublasDgemm (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc);
|
||||
|
||||
private:
|
||||
static void* opencl_;
|
||||
static void* cuda_;
|
||||
static void* nvrtc_;
|
||||
static void* cublas_;
|
||||
static cublasHandle_t cublas_handle_;
|
||||
|
||||
|
||||
//OpenCL
|
||||
static void* clBuildProgram_;
|
||||
|
@@ -39,7 +39,7 @@ namespace nvrtc
|
||||
|
||||
#define ISAAC_CREATE_NVRTC_EXCEPTION(name, msg) class ISAACAPI name: public std::exception { public: const char * what() const throw(){ return "NVRTC: Error- " msg; } }
|
||||
|
||||
ISAAC_CREATE_NVRTC_EXCEPTION(out_of_memory ,"out of memory exception");
|
||||
ISAAC_CREATE_NVRTC_EXCEPTION(out_of_memory ,"out of memory");
|
||||
ISAAC_CREATE_NVRTC_EXCEPTION(program_creation_failure ,"program creation failure");
|
||||
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_input ,"invalid input");
|
||||
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_program ,"invalid program");
|
||||
@@ -49,8 +49,6 @@ namespace nvrtc
|
||||
ISAAC_CREATE_NVRTC_EXCEPTION(unknown_error ,"unknown error");
|
||||
|
||||
#undef ISAAC_CREATE_NVRTC_EXCEPTION
|
||||
|
||||
void check(nvrtcResult err);
|
||||
}
|
||||
|
||||
|
||||
@@ -120,11 +118,26 @@ namespace cuda
|
||||
ISAAC_CREATE_CUDA_EXCEPTION(unknown ,"unknown");
|
||||
|
||||
#undef ISAAC_CREATE_CUDA_EXCEPTION
|
||||
|
||||
void check(CUresult);
|
||||
void check_destruction(CUresult);
|
||||
}
|
||||
|
||||
namespace cublas
|
||||
{
|
||||
|
||||
#define ISAAC_CREATE_CUBLAS_EXCEPTION(name, msg) class ISAACAPI name: public std::exception { public: const char * what() const throw(){ return "CUBLAS: Error- " msg; } }
|
||||
|
||||
ISAAC_CREATE_CUBLAS_EXCEPTION(not_initialized ,"not initialized");
|
||||
ISAAC_CREATE_CUBLAS_EXCEPTION(alloc_failed ,"alloc failed");
|
||||
ISAAC_CREATE_CUBLAS_EXCEPTION(invalid_value ,"invalid value");
|
||||
ISAAC_CREATE_CUBLAS_EXCEPTION(arch_mismatch ,"arch mismatch");
|
||||
ISAAC_CREATE_CUBLAS_EXCEPTION(mapping_error ,"mapping error");
|
||||
ISAAC_CREATE_CUBLAS_EXCEPTION(execution_failed ,"execution failed");
|
||||
ISAAC_CREATE_CUBLAS_EXCEPTION(internal_error ,"internal error");
|
||||
ISAAC_CREATE_CUBLAS_EXCEPTION(not_supported ,"not supported");
|
||||
ISAAC_CREATE_CUBLAS_EXCEPTION(license_error ,"license error");
|
||||
ISAAC_CREATE_CUBLAS_EXCEPTION(unknown ,"unknown");
|
||||
|
||||
#undef ISAAC_CREATE_CUBLAS_EXCEPTION
|
||||
}
|
||||
|
||||
namespace ocl
|
||||
{
|
||||
@@ -183,9 +196,6 @@ namespace ocl
|
||||
#ifdef CL_INVALID_PROPERTY
|
||||
ISAAC_CREATE_CL_EXCEPTION(invalid_property, "invalid property");
|
||||
#endif
|
||||
|
||||
ISAACAPI void check(cl_int err);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@@ -68,6 +68,7 @@ public:
|
||||
typedef std::map<std::pair<expression_type, numeric_type>, std::shared_ptr<value_type> > map_type;
|
||||
private:
|
||||
static std::shared_ptr<templates::base> create(std::string const & template_name, std::vector<int> const & x);
|
||||
static std::shared_ptr<templates::base> create(std::string const & op, std::string const & x);
|
||||
static void import(std::string const & fname, driver::CommandQueue const & queue);
|
||||
static map_type & init(driver::CommandQueue const & queue);
|
||||
public:
|
||||
|
@@ -217,9 +217,9 @@ void backend::platforms(std::vector<Platform> & platforms)
|
||||
if(dispatch::clinit())
|
||||
{
|
||||
cl_uint nplatforms;
|
||||
check(dispatch::dispatch::clGetPlatformIDs(0, NULL, &nplatforms));
|
||||
dispatch::dispatch::clGetPlatformIDs(0, NULL, &nplatforms);
|
||||
std::vector<cl_platform_id> clplatforms(nplatforms);
|
||||
check(dispatch::dispatch::clGetPlatformIDs(nplatforms, clplatforms.data(), NULL));
|
||||
dispatch::dispatch::clGetPlatformIDs(nplatforms, clplatforms.data(), NULL);
|
||||
for(cl_platform_id p: clplatforms){
|
||||
Platform tmp(p);
|
||||
if(tmp.name().find("CUDA")!=std::string::npos && has_cuda)
|
||||
|
@@ -45,7 +45,7 @@ Buffer::Buffer(Context const & context, size_t size) : backend_(context.backend_
|
||||
switch(backend_)
|
||||
{
|
||||
case CUDA:
|
||||
check(dispatch::cuMemAlloc(&h_.cu(), size));
|
||||
dispatch::cuMemAlloc(&h_.cu(), size);
|
||||
break;
|
||||
case OPENCL:
|
||||
cl_int err;
|
||||
|
@@ -112,6 +112,25 @@ void check(CUresult err)
|
||||
}
|
||||
}
|
||||
|
||||
void check(cublasStatus_t err)
|
||||
{
|
||||
using namespace isaac::exception::cublas;
|
||||
switch(err)
|
||||
{
|
||||
case CUBLAS_STATUS_SUCCESS : break;
|
||||
case CUBLAS_STATUS_NOT_INITIALIZED : throw not_initialized();
|
||||
case CUBLAS_STATUS_ALLOC_FAILED : throw alloc_failed();
|
||||
case CUBLAS_STATUS_INVALID_VALUE : throw invalid_value();
|
||||
case CUBLAS_STATUS_ARCH_MISMATCH : throw arch_mismatch();
|
||||
case CUBLAS_STATUS_MAPPING_ERROR : throw mapping_error();
|
||||
case CUBLAS_STATUS_EXECUTION_FAILED: throw execution_failed();
|
||||
case CUBLAS_STATUS_INTERNAL_ERROR : throw internal_error();
|
||||
case CUBLAS_STATUS_NOT_SUPPORTED : throw not_supported();
|
||||
case CUBLAS_STATUS_LICENSE_ERROR : throw license_error();
|
||||
default : throw unknown();
|
||||
}
|
||||
}
|
||||
|
||||
void check_destruction(CUresult result)
|
||||
{
|
||||
if(result!=CUDA_ERROR_DEINITIALIZED)
|
||||
@@ -179,6 +198,7 @@ void check(cl_int err)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -50,7 +50,7 @@ CommandQueue::CommandQueue(Context const & context, Device const & device, cl_co
|
||||
{
|
||||
case CUDA:
|
||||
{
|
||||
check(dispatch::cuStreamCreate(&h_.cu(), 0));
|
||||
dispatch::cuStreamCreate(&h_.cu(), 0);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -86,8 +86,8 @@ void CommandQueue::synchronize()
|
||||
{
|
||||
switch(backend_)
|
||||
{
|
||||
case CUDA: check(dispatch::cuStreamSynchronize(h_.cu())); break;
|
||||
case OPENCL: check(dispatch::clFinish(h_.cl())); break;
|
||||
case CUDA: dispatch::cuStreamSynchronize(h_.cu()); break;
|
||||
case OPENCL: dispatch::clFinish(h_.cl()); break;
|
||||
default: throw;
|
||||
}
|
||||
}
|
||||
@@ -98,16 +98,16 @@ void CommandQueue::enqueue(Kernel const & kernel, NDRange global, driver::NDRang
|
||||
{
|
||||
case CUDA:
|
||||
if(event)
|
||||
check(dispatch::cuEventRecord(event->h_.cu().first, h_.cu()));
|
||||
dispatch::cuEventRecord(event->h_.cu().first, h_.cu());
|
||||
|
||||
check(dispatch::cuLaunchKernel(kernel.h_.cu(), global[0]/local[0], global[1]/local[1], global[2]/local[2],
|
||||
local[0], local[1], local[2], 0, h_.cu(),(void**)&kernel.cu_params_[0], NULL));
|
||||
dispatch::cuLaunchKernel(kernel.h_.cu(), global[0]/local[0], global[1]/local[1], global[2]/local[2],
|
||||
local[0], local[1], local[2], 0, h_.cu(),(void**)&kernel.cu_params_[0], NULL);
|
||||
|
||||
if(event)
|
||||
check(dispatch::cuEventRecord(event->h_.cu().second, h_.cu()));
|
||||
dispatch::cuEventRecord(event->h_.cu().second, h_.cu());
|
||||
break;
|
||||
case OPENCL:
|
||||
check(dispatch::clEnqueueNDRangeKernel(h_.cl(), kernel.h_.cl(), global.dimension(), NULL, (const size_t *)global, (const size_t *) local, 0, NULL, event?&event->h_.cl():NULL));
|
||||
dispatch::clEnqueueNDRangeKernel(h_.cl(), kernel.h_.cl(), global.dimension(), NULL, (const size_t *)global, (const size_t *) local, 0, NULL, event?&event->h_.cl():NULL);
|
||||
break;
|
||||
default: throw;
|
||||
}
|
||||
@@ -119,12 +119,12 @@ void CommandQueue::write(Buffer const & buffer, bool blocking, std::size_t offse
|
||||
{
|
||||
case CUDA:
|
||||
if(blocking)
|
||||
check(dispatch::cuMemcpyHtoD(buffer.h_.cu() + offset, ptr, size));
|
||||
dispatch::cuMemcpyHtoD(buffer.h_.cu() + offset, ptr, size);
|
||||
else
|
||||
check(dispatch::cuMemcpyHtoDAsync(buffer.h_.cu() + offset, ptr, size, h_.cu()));
|
||||
dispatch::cuMemcpyHtoDAsync(buffer.h_.cu() + offset, ptr, size, h_.cu());
|
||||
break;
|
||||
case OPENCL:
|
||||
check(dispatch::clEnqueueWriteBuffer(h_.cl(), buffer.h_.cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL));
|
||||
dispatch::clEnqueueWriteBuffer(h_.cl(), buffer.h_.cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL);
|
||||
break;
|
||||
default: throw;
|
||||
}
|
||||
@@ -136,12 +136,12 @@ void CommandQueue::read(Buffer const & buffer, bool blocking, std::size_t offset
|
||||
{
|
||||
case CUDA:
|
||||
if(blocking)
|
||||
check(dispatch::cuMemcpyDtoH(ptr, buffer.h_.cu() + offset, size));
|
||||
dispatch::cuMemcpyDtoH(ptr, buffer.h_.cu() + offset, size);
|
||||
else
|
||||
check(dispatch::cuMemcpyDtoHAsync(ptr, buffer.h_.cu() + offset, size, h_.cu()));
|
||||
dispatch::cuMemcpyDtoHAsync(ptr, buffer.h_.cu() + offset, size, h_.cu());
|
||||
break;
|
||||
case OPENCL:
|
||||
check(dispatch::clEnqueueReadBuffer(h_.cl(), buffer.h_.cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL));
|
||||
dispatch::clEnqueueReadBuffer(h_.cl(), buffer.h_.cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL);
|
||||
break;
|
||||
default: throw;
|
||||
}
|
||||
|
@@ -74,7 +74,7 @@ Context::Context(Device const & device) : backend_(device.backend_), device_(dev
|
||||
switch(backend_)
|
||||
{
|
||||
case CUDA:
|
||||
check(dispatch::cuCtxCreate(&h_.cu(), CU_CTX_SCHED_AUTO, device.h_.cu()));
|
||||
dispatch::cuCtxCreate(&h_.cu(), CU_CTX_SCHED_AUTO, device.h_.cu());
|
||||
break;
|
||||
case OPENCL:
|
||||
cl_int err;
|
||||
|
@@ -38,7 +38,7 @@ template<CUdevice_attribute attr>
|
||||
int Device::cuGetInfo() const
|
||||
{
|
||||
int res;
|
||||
check(dispatch::cuDeviceGetAttribute(&res, attr, h_.cu()));
|
||||
dispatch::cuDeviceGetAttribute(&res, attr, h_.cu());
|
||||
return res;
|
||||
}
|
||||
|
||||
@@ -197,7 +197,7 @@ std::string Device::name() const
|
||||
{
|
||||
case CUDA:
|
||||
char tmp[128];
|
||||
check(dispatch::cuDeviceGetName(tmp, 128, h_.cu()));
|
||||
dispatch::cuDeviceGetName(tmp, 128, h_.cu());
|
||||
return std::string(tmp);
|
||||
case OPENCL:
|
||||
return ocl::info<CL_DEVICE_NAME>(h_.cl());
|
||||
|
@@ -20,6 +20,7 @@
|
||||
*/
|
||||
|
||||
#include "isaac/driver/dispatch.h"
|
||||
#include "isaac/driver/context.h"
|
||||
|
||||
namespace isaac
|
||||
{
|
||||
@@ -126,11 +127,8 @@ bool dispatch::nvrtcinit()
|
||||
|
||||
bool dispatch::cublasinit()
|
||||
{
|
||||
if(cublas_==nullptr){
|
||||
if(cublas_==nullptr)
|
||||
cublas_ = dlopen("libcublas.so", RTLD_LAZY);
|
||||
if(cublas_!=nullptr)
|
||||
dispatch::cublasCreate_v2(&cublas_handle_);
|
||||
}
|
||||
return cublas_ != nullptr;
|
||||
}
|
||||
|
||||
@@ -212,19 +210,28 @@ NVRTC_DEFINE2(nvrtcResult, nvrtcGetPTXSize, nvrtcProgram, size_t *)
|
||||
NVRTC_DEFINE6(nvrtcResult, nvrtcCreateProgram, nvrtcProgram *, const char *, const char *, int, const char **, const char **)
|
||||
NVRTC_DEFINE2(nvrtcResult, nvrtcGetProgramLog, nvrtcProgram, char *)
|
||||
|
||||
cublasHandle_t dispatch::cublasHandle(Context const & ctx)
|
||||
{
|
||||
static std::map<Context, cublasHandle_t> handles;
|
||||
auto pr = handles.insert({ctx, cublasHandle_t()});
|
||||
if(pr.second)
|
||||
cublasCreate_v2(&pr.first->second);
|
||||
return pr.first->second;
|
||||
}
|
||||
|
||||
CUBLAS_DEFINE1(cublasStatus_t, cublasCreate_v2, cublasHandle_t*)
|
||||
|
||||
cublasStatus_t dispatch::cublasGetStream(cudaStream_t *a)
|
||||
{ return f_impl<dispatch::cublasinit>(cublas_, cublasGetStream_v2, cublasGetStream_, "cublasGetStream_v2", cublas_handle_, a); }
|
||||
cublasStatus_t dispatch::cublasGetStream(cublasHandle_t h, cudaStream_t *a)
|
||||
{ return f_impl<dispatch::cublasinit>(cublas_, cublasGetStream_v2, cublasGetStream_, "cublasGetStream_v2", h, a); }
|
||||
|
||||
cublasStatus_t dispatch::cublasSetStream(cudaStream_t a)
|
||||
{ return f_impl<dispatch::cublasinit>(cublas_, cublasSetStream_v2, cublasSetStream_, "cublasSetStream_v2", cublas_handle_, a); }
|
||||
cublasStatus_t dispatch::cublasSetStream(cublasHandle_t h, cudaStream_t a)
|
||||
{ return f_impl<dispatch::cublasinit>(cublas_, cublasSetStream_v2, cublasSetStream_, "cublasSetStream_v2", h, a); }
|
||||
|
||||
cublasStatus_t dispatch::cublasSgemm(cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc)
|
||||
{ return f_impl<dispatch::cublasinit>(cublas_, cublasSgemm_v2, cublasSgemm_, "cublasSgemm_v2", cublas_handle_, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
|
||||
cublasStatus_t dispatch::cublasSgemm(cublasHandle_t h, cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc)
|
||||
{ return f_impl<dispatch::cublasinit>(cublas_, cublasSgemm_v2, cublasSgemm_, "cublasSgemm_v2", h, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
|
||||
|
||||
cublasStatus_t dispatch::cublasDgemm(cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc)
|
||||
{ return f_impl<dispatch::cublasinit>(cublas_, cublasDgemm_v2, cublasDgemm_, "cublasDgemm_v2", cublas_handle_, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
|
||||
cublasStatus_t dispatch::cublasDgemm(cublasHandle_t h, cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc)
|
||||
{ return f_impl<dispatch::cublasinit>(cublas_, cublasDgemm_v2, cublasDgemm_, "cublasDgemm_v2", h, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
|
||||
|
||||
void dispatch::release()
|
||||
{
|
||||
@@ -250,7 +257,6 @@ void * dispatch::opencl_;
|
||||
void * dispatch::cuda_;
|
||||
void * dispatch::nvrtc_;
|
||||
void * dispatch::cublas_;
|
||||
cublasHandle_t dispatch::cublas_handle_;
|
||||
|
||||
//OpenCL
|
||||
void* dispatch::clBuildProgram_;
|
||||
|
@@ -33,8 +33,8 @@ Event::Event(backend_type backend) : backend_(backend), h_(backend_, true)
|
||||
switch(backend_)
|
||||
{
|
||||
case CUDA:
|
||||
check(dispatch::dispatch::cuEventCreate(&h_.cu().first, CU_EVENT_DEFAULT));
|
||||
check(dispatch::dispatch::cuEventCreate(&h_.cu().second, CU_EVENT_DEFAULT));
|
||||
dispatch::cuEventCreate(&h_.cu().first, CU_EVENT_DEFAULT);
|
||||
dispatch::cuEventCreate(&h_.cu().second, CU_EVENT_DEFAULT);
|
||||
break;
|
||||
case OPENCL:
|
||||
break;
|
||||
@@ -54,7 +54,7 @@ long Event::elapsed_time() const
|
||||
{
|
||||
case CUDA:
|
||||
float time;
|
||||
check(dispatch::cuEventElapsedTime(&time, h_.cu().first, h_.cu().second));
|
||||
dispatch::cuEventElapsedTime(&time, h_.cu().first, h_.cu().second);
|
||||
return 1e6*time;
|
||||
case OPENCL:
|
||||
return static_cast<long>(ocl::info<CL_PROFILING_COMMAND_END>(h_.cl()) - ocl::info<CL_PROFILING_COMMAND_START>(h_.cl()));
|
||||
|
@@ -37,7 +37,7 @@ template<class CLType, class CUType>
|
||||
void Handle<CLType, CUType>::_delete(CUcontext x) { check_destruction(dispatch::cuCtxDestroy(x)); }
|
||||
|
||||
template<class CLType, class CUType>
|
||||
void Handle<CLType, CUType>::_delete(CUdeviceptr x) { check_destruction(dispatch::dispatch::cuMemFree(x)); }
|
||||
void Handle<CLType, CUType>::_delete(CUdeviceptr x) { check_destruction(dispatch::cuMemFree(x)); }
|
||||
|
||||
template<class CLType, class CUType>
|
||||
void Handle<CLType, CUType>::_delete(CUstream x) { check_destruction(dispatch::cuStreamDestroy(x)); }
|
||||
@@ -46,38 +46,38 @@ template<class CLType, class CUType>
|
||||
void Handle<CLType, CUType>::_delete(CUdevice) { }
|
||||
|
||||
template<class CLType, class CUType>
|
||||
void Handle<CLType, CUType>::_delete(CUevent x) { check_destruction(dispatch::dispatch::cuEventDestroy(x)); }
|
||||
void Handle<CLType, CUType>::_delete(CUevent x) { check_destruction(dispatch::cuEventDestroy(x)); }
|
||||
|
||||
template<class CLType, class CUType>
|
||||
void Handle<CLType, CUType>::_delete(CUfunction) { }
|
||||
|
||||
template<class CLType, class CUType>
|
||||
void Handle<CLType, CUType>::_delete(CUmodule x) { check_destruction(dispatch::dispatch::cuModuleUnload(x)); }
|
||||
void Handle<CLType, CUType>::_delete(CUmodule x) { check_destruction(dispatch::cuModuleUnload(x)); }
|
||||
|
||||
template<class CLType, class CUType>
|
||||
void Handle<CLType, CUType>::_delete(cu_event_t x) { _delete(x.first); _delete(x.second); }
|
||||
|
||||
//OpenCL
|
||||
template<class CLType, class CUType>
|
||||
void Handle<CLType, CUType>::release(cl_context x) { check(dispatch::clReleaseContext(x)); }
|
||||
void Handle<CLType, CUType>::release(cl_context x) { dispatch::clReleaseContext(x); }
|
||||
|
||||
template<class CLType, class CUType>
|
||||
void Handle<CLType, CUType>::release(cl_mem x) { check(dispatch::clReleaseMemObject(x)); }
|
||||
void Handle<CLType, CUType>::release(cl_mem x) { dispatch::clReleaseMemObject(x); }
|
||||
|
||||
template<class CLType, class CUType>
|
||||
void Handle<CLType, CUType>::release(cl_command_queue x) { check(dispatch::clReleaseCommandQueue(x)); }
|
||||
void Handle<CLType, CUType>::release(cl_command_queue x) { dispatch::clReleaseCommandQueue(x); }
|
||||
|
||||
template<class CLType, class CUType>
|
||||
void Handle<CLType, CUType>::release(cl_device_id x) { check(dispatch::clReleaseDevice(x)); }
|
||||
void Handle<CLType, CUType>::release(cl_device_id x) { dispatch::clReleaseDevice(x); }
|
||||
|
||||
template<class CLType, class CUType>
|
||||
void Handle<CLType, CUType>::release(cl_event x) { check(dispatch::clReleaseEvent(x)); }
|
||||
void Handle<CLType, CUType>::release(cl_event x) { dispatch::clReleaseEvent(x); }
|
||||
|
||||
template<class CLType, class CUType>
|
||||
void Handle<CLType, CUType>::release(cl_kernel x) { check(dispatch::clReleaseKernel(x)); }
|
||||
void Handle<CLType, CUType>::release(cl_kernel x) { dispatch::clReleaseKernel(x); }
|
||||
|
||||
template<class CLType, class CUType>
|
||||
void Handle<CLType, CUType>::release(cl_program x) { check(dispatch::clReleaseProgram(x)); }
|
||||
void Handle<CLType, CUType>::release(cl_program x) { dispatch::clReleaseProgram(x); }
|
||||
|
||||
template<class CLType, class CUType>
|
||||
Handle<CLType, CUType>::Handle(backend_type backend, bool take_ownership): backend_(backend), has_ownership_(take_ownership)
|
||||
|
@@ -45,8 +45,7 @@ struct info<cl_mem>
|
||||
|
||||
static void get(cl_mem handle, cl_mem_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret)
|
||||
{
|
||||
cl_int err = dispatch::clGetMemObjectInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||
check(err);
|
||||
dispatch::clGetMemObjectInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -57,8 +56,7 @@ struct info<cl_device_id>
|
||||
|
||||
static void get(cl_device_id handle, cl_device_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret)
|
||||
{
|
||||
cl_int err = dispatch::clGetDeviceInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||
check(err);
|
||||
dispatch::clGetDeviceInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -68,13 +66,11 @@ struct info<cl_kernel>
|
||||
typedef cl_kernel_info type;
|
||||
|
||||
static void get(cl_kernel handle, cl_kernel_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
||||
cl_int err = dispatch::clGetKernelInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||
check(err);
|
||||
dispatch::clGetKernelInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||
}
|
||||
|
||||
static void get(cl_kernel handle, cl_device_id dev_id, cl_kernel_work_group_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
||||
cl_int err = dispatch::clGetKernelWorkGroupInfo(handle, dev_id, param_name,param_value_size,param_value,param_value_size_ret);
|
||||
check(err);
|
||||
dispatch::clGetKernelWorkGroupInfo(handle, dev_id, param_name,param_value_size,param_value,param_value_size_ret);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -84,8 +80,7 @@ struct info<cl_context>
|
||||
typedef cl_context_info type;
|
||||
|
||||
static void get(cl_context handle, cl_context_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
||||
cl_int err = dispatch::clGetContextInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||
check(err);
|
||||
dispatch::clGetContextInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -95,13 +90,11 @@ struct info<cl_program>
|
||||
typedef cl_program_info type;
|
||||
|
||||
static void get(cl_program handle, cl_program_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
||||
cl_int err = dispatch::clGetProgramInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||
check(err);
|
||||
dispatch::clGetProgramInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||
}
|
||||
|
||||
static void get(cl_program handle, cl_device_id device, cl_program_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
||||
cl_int err = dispatch::clGetProgramBuildInfo(handle,device,param_name,param_value_size,param_value,param_value_size_ret);
|
||||
check(err);
|
||||
dispatch::clGetProgramBuildInfo(handle,device,param_name,param_value_size,param_value,param_value_size_ret);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -111,8 +104,7 @@ struct info<cl_event>
|
||||
{
|
||||
typedef cl_profiling_info type;
|
||||
static void get(cl_event handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
||||
cl_int err = dispatch::clGetEventProfilingInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||
check(err);
|
||||
dispatch::clGetEventProfilingInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -121,8 +113,7 @@ struct info<cl_command_queue>
|
||||
{
|
||||
typedef cl_command_queue_info type;
|
||||
static void get(cl_command_queue handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
||||
cl_int err = dispatch::clGetCommandQueueInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||
check(err);
|
||||
dispatch::clGetCommandQueueInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -131,8 +122,7 @@ struct info<cl_platform_id>
|
||||
{
|
||||
typedef cl_command_queue_info type;
|
||||
static void get(cl_platform_id handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
||||
cl_int err = dispatch::clGetPlatformInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||
check(err);
|
||||
dispatch::clGetPlatformInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||
}
|
||||
};
|
||||
|
||||
|
@@ -38,7 +38,7 @@ Kernel::Kernel(Program const & program, const char * name) : backend_(program.ba
|
||||
case CUDA:
|
||||
cu_params_store_.reserve(64);
|
||||
cu_params_.reserve(64);
|
||||
check(dispatch::cuModuleGetFunction(&h_.cu(), program.h_.cu(), name));\
|
||||
dispatch::cuModuleGetFunction(&h_.cu(), program.h_.cu(), name);\
|
||||
break;
|
||||
case OPENCL:
|
||||
cl_int err;
|
||||
@@ -85,7 +85,7 @@ void Kernel::setArg(unsigned int index, std::size_t size, void* ptr)
|
||||
cu_params_[index] = cu_params_store_[index].get();
|
||||
break;
|
||||
case OPENCL:
|
||||
check(dispatch::clSetKernelArg(h_.cl(), index, size, ptr));
|
||||
dispatch::clSetKernelArg(h_.cl(), index, size, ptr);
|
||||
break;
|
||||
default:
|
||||
throw;
|
||||
@@ -101,7 +101,7 @@ void Kernel::setArg(unsigned int index, Buffer const & data)
|
||||
setArg(index, sizeof(CUdeviceptr), (void*)&data.h_.cu()); break;
|
||||
}
|
||||
case OPENCL:
|
||||
check(dispatch::clSetKernelArg(h_.cl(), index, sizeof(cl_mem), (void*)&data.h_.cl()));
|
||||
dispatch::clSetKernelArg(h_.cl(), index, sizeof(cl_mem), (void*)&data.h_.cl());
|
||||
break;
|
||||
default: throw;
|
||||
}
|
||||
|
@@ -78,7 +78,7 @@ void Platform::devices(std::vector<Device> & devices) const
|
||||
case CUDA:
|
||||
{
|
||||
int N;
|
||||
check(dispatch::cuDeviceGetCount(&N));
|
||||
dispatch::cuDeviceGetCount(&N);
|
||||
for(int i = 0 ; i < N ; ++i){
|
||||
CUdevice device;
|
||||
dispatch::cuDeviceGet(&device, i);
|
||||
@@ -89,9 +89,9 @@ void Platform::devices(std::vector<Device> & devices) const
|
||||
case OPENCL:
|
||||
{
|
||||
cl_uint ndevices;
|
||||
check(dispatch::dispatch::clGetDeviceIDs(cl_platform_, CL_DEVICE_TYPE_ALL, 0, NULL, &ndevices));
|
||||
dispatch::dispatch::clGetDeviceIDs(cl_platform_, CL_DEVICE_TYPE_ALL, 0, NULL, &ndevices);
|
||||
std::vector<cl_device_id> device_ids(ndevices);
|
||||
check(dispatch::dispatch::clGetDeviceIDs(cl_platform_, CL_DEVICE_TYPE_ALL, ndevices, device_ids.data(), NULL));
|
||||
dispatch::dispatch::clGetDeviceIDs(cl_platform_, CL_DEVICE_TYPE_ALL, ndevices, device_ids.data(), NULL);
|
||||
for(cl_device_id d : device_ids)
|
||||
devices.push_back(Device(d));
|
||||
break;
|
||||
|
@@ -55,7 +55,7 @@ Program::Program(Context const & context, std::string const & source) : backend_
|
||||
//Load cached program
|
||||
if(cache_path.size() && std::ifstream(fname, std::ios::binary))
|
||||
{
|
||||
check(dispatch::cuModuleLoad(&h_.cu(), fname.c_str()));
|
||||
dispatch::cuModuleLoad(&h_.cu(), fname.c_str());
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -64,28 +64,28 @@ Program::Program(Context const & context, std::string const & source) : backend_
|
||||
const char * includes[] = {"vector.h"};
|
||||
const char * src[] = {helpers::cuda::vector};
|
||||
|
||||
check(dispatch::nvrtcCreateProgram(&prog, source.c_str(), NULL, 1, src, includes));
|
||||
dispatch::nvrtcCreateProgram(&prog, source.c_str(), NULL, 1, src, includes);
|
||||
try{
|
||||
std::pair<unsigned int, unsigned int> capability = context_.device().nv_compute_capability();
|
||||
std::string capability_opt = "--gpu-architecture=compute_";
|
||||
capability_opt += tools::to_string(capability.first) + tools::to_string(capability.second);
|
||||
const char * options[] = {capability_opt.c_str(), "--restrict"};
|
||||
check(dispatch::nvrtcCompileProgram(prog, 2, options));
|
||||
dispatch::nvrtcCompileProgram(prog, 2, options);
|
||||
}catch(exception::nvrtc::compilation const &)
|
||||
{
|
||||
size_t logsize;
|
||||
check(dispatch::nvrtcGetProgramLogSize(prog, &logsize));
|
||||
dispatch::nvrtcGetProgramLogSize(prog, &logsize);
|
||||
std::string log(logsize, 0);
|
||||
check(dispatch::nvrtcGetProgramLog(prog, (char*)log.data()));
|
||||
dispatch::nvrtcGetProgramLog(prog, (char*)log.data());
|
||||
std::cout << "Compilation failed:" << std::endl;
|
||||
std::cout << log << std::endl;
|
||||
}
|
||||
|
||||
size_t ptx_size;
|
||||
check(dispatch::nvrtcGetPTXSize(prog, &ptx_size));
|
||||
dispatch::nvrtcGetPTXSize(prog, &ptx_size);
|
||||
std::vector<char> ptx(ptx_size);
|
||||
check(dispatch::nvrtcGetPTX(prog, ptx.data()));
|
||||
check(dispatch::cuModuleLoadDataEx(&h_.cu(), ptx.data(), 0, NULL, NULL));
|
||||
dispatch::nvrtcGetPTX(prog, ptx.data());
|
||||
dispatch::cuModuleLoadDataEx(&h_.cu(), ptx.data(), 0, NULL, NULL);
|
||||
|
||||
//Save cached program
|
||||
if (cache_path.size())
|
||||
@@ -112,7 +112,7 @@ Program::Program(Context const & context, std::string const & source) : backend_
|
||||
|
||||
// str.assign((std::istreambuf_iterator<char>(ifs)),
|
||||
// std::istreambuf_iterator<char>());
|
||||
// check(dispatch::cuModuleLoadDataEx(&h_.cu(), str.c_str(), 0, NULL, NULL));
|
||||
// dispatch::cuModuleLoadDataEx(&h_.cu(), str.c_str(), 0, NULL, NULL);
|
||||
|
||||
break;
|
||||
}
|
||||
@@ -141,7 +141,7 @@ Program::Program(Context const & context, std::string const & source) : backend_
|
||||
char* cbuffer = buffer.data();
|
||||
h_.cl() = dispatch::clCreateProgramWithBinary(context_.h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), &len, (const unsigned char **)&cbuffer, NULL, &err);
|
||||
check(err);
|
||||
check(dispatch::clBuildProgram(h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), build_opt.c_str(), NULL, NULL));
|
||||
dispatch::clBuildProgram(h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), build_opt.c_str(), NULL, NULL);
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -150,7 +150,7 @@ Program::Program(Context const & context, std::string const & source) : backend_
|
||||
const char * csrc = source.c_str();
|
||||
h_.cl() = dispatch::clCreateProgramWithSource(context_.h_.cl(), 1, &csrc, &srclen, &err);
|
||||
try{
|
||||
check(dispatch::clBuildProgram(h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), build_opt.c_str(), NULL, NULL));
|
||||
dispatch::clBuildProgram(h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), build_opt.c_str(), NULL, NULL);
|
||||
//Save cached program
|
||||
if (cache_path.size())
|
||||
{
|
||||
|
@@ -50,7 +50,7 @@ std::vector<int_t> infos(expression_tree const & tree, symbolic::preset::gemm::a
|
||||
}
|
||||
|
||||
/* ------------------ CUBLAS ------------------ */
|
||||
cublas_gemm::cublas_gemm(char A_trans, char B_trans): A_trans_(A_trans), B_trans_(B_trans), init_(driver::dispatch::cublasinit())
|
||||
cublas_gemm::cublas_gemm(char A_trans, char B_trans): A_trans_(A_trans), B_trans_(B_trans), init_(true)
|
||||
{ }
|
||||
|
||||
int cublas_gemm::is_invalid(expression_tree const &, driver::Device const & device) const
|
||||
@@ -85,30 +85,31 @@ void cublas_gemm::enqueue(driver::CommandQueue & queue, driver::Program const &,
|
||||
CUdeviceptr cuB = args.B->array.handle.cu;
|
||||
CUdeviceptr cuC = args.C->array.handle.cu;
|
||||
runtime::execution_options_type const & opt = control.execution_options();
|
||||
auto cuT = [](char xt) { return xt=='N'?CUBLAS_OP_N:CUBLAS_OP_T; };
|
||||
auto cuT = [](char xt) { return (xt=='N')?CUBLAS_OP_N:CUBLAS_OP_T; };
|
||||
int offA = args.A->array.start, offB = args.B->array.start, offC = args.C->array.start;
|
||||
cublasHandle_t h = drv::dispatch::cublasHandle(queue.context());
|
||||
//Set new stream
|
||||
cudaStream_t bkp;
|
||||
drv::Event event(drv::CUDA);
|
||||
drv::dispatch::cublasGetStream(&bkp);
|
||||
drv::dispatch::cublasSetStream((cudaStream_t)queue.handle().cu());
|
||||
drv::dispatch::cublasGetStream(h,&bkp);
|
||||
drv::dispatch::cublasSetStream(h,(cudaStream_t)queue.handle().cu());
|
||||
values_holder alpha = args.alpha.values();
|
||||
values_holder beta = args.beta.values();
|
||||
if(opt.events)
|
||||
drv::check(drv::dispatch::cuEventRecord(event.handle().cu().first, queue.handle().cu()));
|
||||
if(args.C->dtype==FLOAT_TYPE)
|
||||
drv::dispatch::cublasSgemm(cuT(A_trans_), cuT(B_trans_), M, N, K, &alpha.float32, (float*)cuA, args.A->ld[1], (float*)cuB, args.B->ld[1], &beta.float32, (float*)cuC, args.C->ld[1]);
|
||||
drv::dispatch::cublasSgemm(h,cuT(A_trans_), cuT(B_trans_), M, N, K, &alpha.float32, (float*)cuA + offA , args.A->ld[1], (float*)cuB + offB, args.B->ld[1], &beta.float32, (float*)cuC + offC, args.C->ld[1]);
|
||||
else
|
||||
drv::dispatch::cublasDgemm(cuT(A_trans_), cuT(B_trans_), M, N, K, &alpha.float64, (double*)cuA, args.A->ld[1], (double*)cuB, args.B->ld[1], &beta.float64, (double*)cuC, args.C->ld[1]);
|
||||
drv::dispatch::cublasDgemm(h,cuT(A_trans_), cuT(B_trans_), M, N, K, &alpha.float64, (double*)cuA + offA, args.A->ld[1], (double*)cuB + offB, args.B->ld[1], &beta.float64, (double*)cuC + offC, args.C->ld[1]);
|
||||
if(opt.events){
|
||||
drv::check(drv::dispatch::cuEventRecord(event.handle().cu().second, queue.handle().cu()));
|
||||
opt.events->push_back(event);
|
||||
}
|
||||
//Revert old stream
|
||||
drv::dispatch::cublasSetStream(bkp);
|
||||
drv::dispatch::cublasSetStream(h,bkp);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* -------------------------------------------- */
|
||||
unsigned int gemm::lmem_usage(expression_tree const & expression) const
|
||||
{
|
||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
@@ -139,6 +139,16 @@ profiles::value_type::templates_container const & profiles::value_type::template
|
||||
return templates_;
|
||||
}
|
||||
|
||||
std::shared_ptr<templates::base> profiles::create(std::string const & op, std::string const & str)
|
||||
{
|
||||
if(str=="cublas_gemm"){
|
||||
if(op=="gemm_nn") return std::shared_ptr<templates::base>(new templates::cublas_gemm('N', 'N'));
|
||||
if(op=="gemm_nt") return std::shared_ptr<templates::base>(new templates::cublas_gemm('N', 'T'));
|
||||
if(op=="gemm_tn") return std::shared_ptr<templates::base>(new templates::cublas_gemm('T', 'N'));
|
||||
if(op=="gemm_tt") return std::shared_ptr<templates::base>(new templates::cublas_gemm('T', 'T'));
|
||||
}
|
||||
throw;
|
||||
}
|
||||
|
||||
std::shared_ptr<templates::base> profiles::create(std::string const & template_name, std::vector<int> const & x)
|
||||
{
|
||||
@@ -189,8 +199,12 @@ void profiles::import(std::string const & str, driver::CommandQueue const & queu
|
||||
// Get profiles
|
||||
std::vector<std::shared_ptr<templates::base> > templates;
|
||||
rapidjson::Value const & profiles = document[opcstr][dtcstr]["profiles"];
|
||||
for (rapidjson::SizeType id = 0 ; id < profiles.Size() ; ++id)
|
||||
templates.push_back(create(operation, rapidjson::to_int_array<int>(profiles[id])));
|
||||
for (rapidjson::SizeType i = 0 ; i < profiles.Size() ; ++i){
|
||||
if(profiles[i].IsString())
|
||||
templates.push_back(create(operation, profiles[i].GetString()));
|
||||
else
|
||||
templates.push_back(create(operation, rapidjson::to_int_array<int>(profiles[i])));
|
||||
}
|
||||
if(templates.size()>1){
|
||||
// Get predictor
|
||||
predictors::random_forest predictor(document[opcstr][dtcstr]["predictor"]);
|
||||
|
@@ -174,7 +174,7 @@ def is_local_optimum(parameters, template, sizes, context):
|
||||
|
||||
#Evaluate the provided parameters guess
|
||||
reference = tools.benchmark(template(*parameters), tree)
|
||||
if isinf(reference):
|
||||
if reference==float('inf'):
|
||||
return False
|
||||
|
||||
#Latency bound -- ignore
|
||||
|
@@ -40,7 +40,8 @@ def linspace(a, b, n=100):
|
||||
def expspace(a,b,N,r=128):
|
||||
return [int(ceil(exp(x)/r)*r) for x in linspace(log(a), log(b), N)]
|
||||
|
||||
def benchmark(template, tree):
|
||||
|
||||
def benchmark(template, tree, operation=sc.templates.gemm_nn):
|
||||
queue = tree.context.queues[0]
|
||||
queue.profiles[template, sc.float32] = sc.profile(template, sc.float32, queue)
|
||||
times = []
|
||||
@@ -49,14 +50,14 @@ def benchmark(template, tree):
|
||||
#Warm-up
|
||||
try:
|
||||
z, events = sc.driver.enqueue(tree)
|
||||
tree.context.queues[0].synchronize()
|
||||
queue.synchronize()
|
||||
except profile_execution_failure:
|
||||
return float("inf")
|
||||
#Time
|
||||
while total < 1e-1:
|
||||
while total < 1e-2:
|
||||
start = time()
|
||||
z, events = sc.driver.enqueue(tree)
|
||||
tree.context.queues[0].synchronize()
|
||||
queue.synchronize()
|
||||
end = time()
|
||||
times.append(end - start)
|
||||
total += times[-1]
|
||||
@@ -91,11 +92,12 @@ def tree_of(template, sizes, context):
|
||||
AT = template is sc.templates.gemm_tn or template is sc.templates.gemm_tt
|
||||
BT = template is sc.templates.gemm_nt or template is sc.templates.gemm_tt
|
||||
M, N, K = sizes
|
||||
C = sc.empty((M,N), context=context)
|
||||
A = sc.empty((K, M) if AT else (M, K), context=context)
|
||||
B = sc.empty((N, K) if BT else (K, N), context=context)
|
||||
AA = A.T if AT else A
|
||||
BB = B.T if BT else B
|
||||
return sc.dot(AA, BB), (A, B)
|
||||
return sc.assign(C, sc.dot(AA, BB)), (A, B, C)
|
||||
|
||||
def memory_footprint(template, sizes):
|
||||
if issubclass(template, sc.templates.elementwise_1d):
|
||||
@@ -123,14 +125,16 @@ def metric_name_of(template):
|
||||
return 'GB/S'
|
||||
|
||||
def external_profiles(template):
|
||||
res = []
|
||||
if template is sc.templates.gemm_nn:
|
||||
return [sc.templates.cublas_gemm('N', 'N')]
|
||||
res += [sc.templates.cublas_gemm('N','N')]
|
||||
elif template is sc.templates.gemm_tn:
|
||||
return [sc.templates.cublas_gemm('T', 'N')]
|
||||
res += [sc.templates.cublas_gemm('T','N')]
|
||||
elif template is sc.templates.gemm_nt:
|
||||
return [sc.templates.cublas_gemm('N', 'T')]
|
||||
res += [sc.templates.cublas_gemm('N','T')]
|
||||
elif template is sc.templates.gemm_tt:
|
||||
return [sc.templates.cublas_gemm('T', 'T')]
|
||||
res += [sc.templates.cublas_gemm('T','T')]
|
||||
return res
|
||||
|
||||
def genetic_infos_of(template):
|
||||
if issubclass(template, sc.templates.elementwise_1d):
|
||||
@@ -144,4 +148,8 @@ def genetic_infos_of(template):
|
||||
elif issubclass(template, sc.templates.gemm):
|
||||
return {'categorical': [8,9], 'nbits': [3,3,3,3,3,2,2,2,2,2,3,3]}
|
||||
|
||||
|
||||
def convert(profile):
|
||||
if isinstance(profile, str):
|
||||
return profile
|
||||
else:
|
||||
return map(int, profile)
|
||||
|
@@ -77,11 +77,14 @@ class Tuner:
|
||||
#Square
|
||||
for N in [896, 1760, 2048, 2560]:
|
||||
sizes += [(N, N)]
|
||||
#Tall and Skinny
|
||||
#Short/Fat
|
||||
for M in [16, 32, 64, 128]:
|
||||
for N in [1024, 4096, 16384, 65536, 262144]:
|
||||
sizes += [(M, N)]
|
||||
sizes += [(N, M)]
|
||||
#Tall/Skinny
|
||||
for N in [16, 32, 64, 128]:
|
||||
for M in [1024, 4096, 16384, 65536, 262144]:
|
||||
sizes += [(M, N)]
|
||||
|
||||
#BLAS3 training sizes
|
||||
if operation in [sc.templates.gemm_nn, sc.templates.gemm_nt, sc.templates.gemm_tn, sc.templates.gemm_tt]:
|
||||
@@ -113,8 +116,6 @@ class Tuner:
|
||||
try:
|
||||
with open(os.path.join(savepath, 'X.csv')) as f:
|
||||
X = [tuple(map(int, row)) for row in csv.reader(f, delimiter=',')]
|
||||
with open(os.path.join(savepath, 'Y.csv')) as f:
|
||||
Y = [map(float, row) for row in csv.reader(f, delimiter=',')]
|
||||
with open(os.path.join(savepath, 'profiles.csv')) as f:
|
||||
def mmap(x):
|
||||
if x=='FETCH_FROM_LOCAL':
|
||||
@@ -125,6 +126,8 @@ class Tuner:
|
||||
return sc.templates.fetch_type.FETCH_FROM_GLOBAL_STRIDED
|
||||
return int(x)
|
||||
profiles = [map(mmap,row) for v in row for row in csv.reader(f, delimiter=',')]
|
||||
with open(os.path.join(savepath, 'Y.csv')) as f:
|
||||
Y = [map(float, row) for row in csv.reader(f, delimiter=',')]
|
||||
except:
|
||||
pass
|
||||
|
||||
@@ -140,29 +143,30 @@ class Tuner:
|
||||
self.progress_bar.update(1, 1, profiles[argmax(row)], max(row))
|
||||
continue
|
||||
tree, operands = tools.tree_of(operation, x, context)
|
||||
#Check if GA needs to run (i.e., current best prediction is not a local optimum)
|
||||
tune = True
|
||||
#Best predicted profile for x
|
||||
best = None
|
||||
if idx > 0:
|
||||
dim = min(10, idx+1)
|
||||
clf = RandomForestRegressor(dim, dim).fit(X, Y)
|
||||
if len(profiles) > 1:
|
||||
clf = RandomForestRegressor(20, max_depth=5).fit(X, Y)
|
||||
predictions = clf.predict(x)[0]
|
||||
for idx in (-predictions).argsort():
|
||||
ts = tools.benchmark(operation(*profiles[idx]), tree)
|
||||
if np.isfinite(ts):
|
||||
break
|
||||
if np.isfinite(ts):
|
||||
best = profiles[idx]
|
||||
tune = not optimize.is_local_optimum(predicted, operation, x, context)
|
||||
ts = tools.benchmark(operation(*best), tree)
|
||||
if ts != float('inf'):
|
||||
break
|
||||
else:
|
||||
best = profiles[0]
|
||||
|
||||
#Retune if necessary
|
||||
tune = not (best and optimize.is_local_optimum(best, operation, x, context))
|
||||
if tune:
|
||||
optimizer = optimize.GeneticOptimizer(self.logger, naccept=1000, niter=1000, cxpb=.4, mutpb=.4, popsize=20, progress_bar = self.progress_bar)
|
||||
best = optimizer.run(operation, x, context, prior=best)[0]
|
||||
if best not in profiles:
|
||||
profiles.append(best)
|
||||
for xx,yy in zip(X, Y):
|
||||
tree, _operands = tools.tree_of(operation, xx, context)
|
||||
time = tools.benchmark(operation(*best), _tree)
|
||||
tree, _ = tools.tree_of(operation, xx, context)
|
||||
time = tools.benchmark(operation(*best), tree)
|
||||
yy.append(performance(xx, time))
|
||||
#Update dataset
|
||||
X.append(x)
|
||||
@@ -182,11 +186,12 @@ class Tuner:
|
||||
for prof in tools.external_profiles(operation):
|
||||
for x, y in zip(X, Y):
|
||||
tree, operands = tools.tree_of(operation, x, context)
|
||||
perf = performance(x,tools.benchmark(prof, tree))
|
||||
perf = performance(x,tools.benchmark(prof, tree, operation))
|
||||
if perf > 0:
|
||||
profiles.append(prof.__class__.__name__)
|
||||
y.append(perf)
|
||||
|
||||
|
||||
#Pruning of useless profiles
|
||||
if len(Y[0]) > 1:
|
||||
unused = np.where(np.bincount(np.argmax(Y, 1))==0)[0]
|
||||
@@ -212,5 +217,5 @@ class Tuner:
|
||||
'threshold': e.tree_.threshold.astype('float64').tolist(),
|
||||
'feature': e.tree_.feature.astype('float64').tolist(),
|
||||
'value': e.tree_.value[:,:,0].astype('float64').tolist()} for e in clf.estimators_]
|
||||
D['profiles'] = [map(int, x) for x in profiles]
|
||||
D['profiles'] = [tools.convert(x) for x in profiles]
|
||||
json.dump(json_data, open(json_path,'w'))
|
||||
|
Reference in New Issue
Block a user