GEMM: Enabled use of cuBLAS when predicted beneficial
This commit is contained in:
@@ -29,7 +29,7 @@ double bench(OP const & op, SYNC const & sync)
|
|||||||
double total_time = 0;
|
double total_time = 0;
|
||||||
op();
|
op();
|
||||||
sync();
|
sync();
|
||||||
while(total_time*1e-9 < 1e-1){
|
while(total_time*1e-9 < 2e-1){
|
||||||
tmr.start();
|
tmr.start();
|
||||||
op();
|
op();
|
||||||
sync();
|
sync();
|
||||||
@@ -239,12 +239,13 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
#ifdef BENCH_CUBLAS
|
#ifdef BENCH_CUBLAS
|
||||||
times.push_back(bench([&](){cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)cu(A), lda, (T*)cu(B), ldb, 1, (T*)cu(C), ldc);}, cusync));
|
times.push_back(bench([&](){cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)cu(A), lda, (T*)cu(B), ldb, 1, (T*)cu(C), ldc);}, cusync));
|
||||||
#endif
|
#endif
|
||||||
std::cout << name << "\t" << M << "\t" << N << "\t" << K << "\t" << cAT << "\t" << cBT;
|
|
||||||
std::transform(times.begin(), times.end(), std::back_inserter(tflops), [&](double t){ return 2*M*N*K/t*1e-3;});
|
std::transform(times.begin(), times.end(), std::back_inserter(tflops), [&](double t){ return 2*M*N*K/t*1e-3;});
|
||||||
double best = max(tflops);
|
auto fastest = tflops;
|
||||||
for(auto x: tflops){
|
std::sort(fastest.begin(), fastest.end(), std::greater<double>());
|
||||||
|
std::cout << name << "\t" << M << "\t" << N << "\t" << K << "\t" << cAT << "\t" << cBT;
|
||||||
|
for(auto x: tflops){
|
||||||
std::cout << "\t";
|
std::cout << "\t";
|
||||||
if(x==best)
|
if(x == fastest[0] && x/fastest[1] >= 1.02)
|
||||||
std::cout << color_stream(FG_LIGHT_BLUE) << x << color_stream(RESET);
|
std::cout << color_stream(FG_LIGHT_BLUE) << x << color_stream(RESET);
|
||||||
else
|
else
|
||||||
std::cout << x;
|
std::cout << x;
|
||||||
|
@@ -38,13 +38,6 @@ enum backend_type
|
|||||||
CUDA
|
CUDA
|
||||||
};
|
};
|
||||||
|
|
||||||
void check(nvrtcResult err);
|
|
||||||
|
|
||||||
void check(CUresult);
|
|
||||||
void check_destruction(CUresult);
|
|
||||||
|
|
||||||
void check(cl_int err);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -50,7 +50,7 @@ private:
|
|||||||
static CUdevice device(CUcontext)
|
static CUdevice device(CUcontext)
|
||||||
{
|
{
|
||||||
CUdevice res;
|
CUdevice res;
|
||||||
check(dispatch::cuCtxGetDevice(&res));
|
dispatch::cuCtxGetDevice(&res);
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -32,7 +32,8 @@
|
|||||||
#include "isaac/driver/external/CUDA/cuda.h"
|
#include "isaac/driver/external/CUDA/cuda.h"
|
||||||
#include "isaac/driver/external/CUDA/nvrtc.h"
|
#include "isaac/driver/external/CUDA/nvrtc.h"
|
||||||
#include "isaac/driver/external/CUDA/cublas.h"
|
#include "isaac/driver/external/CUDA/cublas.h"
|
||||||
|
//Exceptions
|
||||||
|
#include "isaac/driver/common.h"
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
namespace isaac
|
namespace isaac
|
||||||
@@ -40,6 +41,14 @@ namespace isaac
|
|||||||
namespace driver
|
namespace driver
|
||||||
{
|
{
|
||||||
|
|
||||||
|
class Context;
|
||||||
|
|
||||||
|
template<class T> void check(T){}
|
||||||
|
void check(nvrtcResult err);
|
||||||
|
void check(CUresult err);
|
||||||
|
void check(cublasStatus_t err);
|
||||||
|
void check(cl_int err);
|
||||||
|
void check_destruction(CUresult);
|
||||||
|
|
||||||
class dispatch
|
class dispatch
|
||||||
{
|
{
|
||||||
@@ -61,11 +70,11 @@ private:
|
|||||||
cache = dlsym(lib_h, name);
|
cache = dlsym(lib_h, name);
|
||||||
FunPtrT fptr;
|
FunPtrT fptr;
|
||||||
*reinterpret_cast<void **>(&fptr) = cache;
|
*reinterpret_cast<void **>(&fptr) = cache;
|
||||||
return (*fptr)(args...);
|
typename return_type<FunPtrT>::type res = (*fptr)(args...);
|
||||||
|
check(res);
|
||||||
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
static cublasStatus_t cublasCreate_v2(cublasHandle_t* h);
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
static bool clinit();
|
static bool clinit();
|
||||||
static bool cublasinit();
|
static bool cublasinit();
|
||||||
@@ -146,17 +155,19 @@ public:
|
|||||||
static nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames);
|
static nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames);
|
||||||
static nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
|
static nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
|
||||||
|
|
||||||
static cublasStatus_t cublasGetStream(cudaStream_t *streamId);
|
static cublasHandle_t cublasHandle(Context const & ctx);
|
||||||
static cublasStatus_t cublasSetStream(cudaStream_t streamId);
|
static cublasStatus_t cublasCreate_v2(cublasHandle_t* h);
|
||||||
static cublasStatus_t cublasSgemm (cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc);
|
static cublasStatus_t cublasGetStream(cublasHandle_t h, cudaStream_t *streamId);
|
||||||
static cublasStatus_t cublasDgemm (cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc);
|
static cublasStatus_t cublasSetStream(cublasHandle_t h, cudaStream_t streamId);
|
||||||
|
static cublasStatus_t cublasSgemm (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc);
|
||||||
|
static cublasStatus_t cublasDgemm (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static void* opencl_;
|
static void* opencl_;
|
||||||
static void* cuda_;
|
static void* cuda_;
|
||||||
static void* nvrtc_;
|
static void* nvrtc_;
|
||||||
static void* cublas_;
|
static void* cublas_;
|
||||||
static cublasHandle_t cublas_handle_;
|
|
||||||
|
|
||||||
//OpenCL
|
//OpenCL
|
||||||
static void* clBuildProgram_;
|
static void* clBuildProgram_;
|
||||||
|
@@ -39,7 +39,7 @@ namespace nvrtc
|
|||||||
|
|
||||||
#define ISAAC_CREATE_NVRTC_EXCEPTION(name, msg) class ISAACAPI name: public std::exception { public: const char * what() const throw(){ return "NVRTC: Error- " msg; } }
|
#define ISAAC_CREATE_NVRTC_EXCEPTION(name, msg) class ISAACAPI name: public std::exception { public: const char * what() const throw(){ return "NVRTC: Error- " msg; } }
|
||||||
|
|
||||||
ISAAC_CREATE_NVRTC_EXCEPTION(out_of_memory ,"out of memory exception");
|
ISAAC_CREATE_NVRTC_EXCEPTION(out_of_memory ,"out of memory");
|
||||||
ISAAC_CREATE_NVRTC_EXCEPTION(program_creation_failure ,"program creation failure");
|
ISAAC_CREATE_NVRTC_EXCEPTION(program_creation_failure ,"program creation failure");
|
||||||
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_input ,"invalid input");
|
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_input ,"invalid input");
|
||||||
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_program ,"invalid program");
|
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_program ,"invalid program");
|
||||||
@@ -49,8 +49,6 @@ namespace nvrtc
|
|||||||
ISAAC_CREATE_NVRTC_EXCEPTION(unknown_error ,"unknown error");
|
ISAAC_CREATE_NVRTC_EXCEPTION(unknown_error ,"unknown error");
|
||||||
|
|
||||||
#undef ISAAC_CREATE_NVRTC_EXCEPTION
|
#undef ISAAC_CREATE_NVRTC_EXCEPTION
|
||||||
|
|
||||||
void check(nvrtcResult err);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -120,11 +118,26 @@ namespace cuda
|
|||||||
ISAAC_CREATE_CUDA_EXCEPTION(unknown ,"unknown");
|
ISAAC_CREATE_CUDA_EXCEPTION(unknown ,"unknown");
|
||||||
|
|
||||||
#undef ISAAC_CREATE_CUDA_EXCEPTION
|
#undef ISAAC_CREATE_CUDA_EXCEPTION
|
||||||
|
|
||||||
void check(CUresult);
|
|
||||||
void check_destruction(CUresult);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
namespace cublas
|
||||||
|
{
|
||||||
|
|
||||||
|
#define ISAAC_CREATE_CUBLAS_EXCEPTION(name, msg) class ISAACAPI name: public std::exception { public: const char * what() const throw(){ return "CUBLAS: Error- " msg; } }
|
||||||
|
|
||||||
|
ISAAC_CREATE_CUBLAS_EXCEPTION(not_initialized ,"not initialized");
|
||||||
|
ISAAC_CREATE_CUBLAS_EXCEPTION(alloc_failed ,"alloc failed");
|
||||||
|
ISAAC_CREATE_CUBLAS_EXCEPTION(invalid_value ,"invalid value");
|
||||||
|
ISAAC_CREATE_CUBLAS_EXCEPTION(arch_mismatch ,"arch mismatch");
|
||||||
|
ISAAC_CREATE_CUBLAS_EXCEPTION(mapping_error ,"mapping error");
|
||||||
|
ISAAC_CREATE_CUBLAS_EXCEPTION(execution_failed ,"execution failed");
|
||||||
|
ISAAC_CREATE_CUBLAS_EXCEPTION(internal_error ,"internal error");
|
||||||
|
ISAAC_CREATE_CUBLAS_EXCEPTION(not_supported ,"not supported");
|
||||||
|
ISAAC_CREATE_CUBLAS_EXCEPTION(license_error ,"license error");
|
||||||
|
ISAAC_CREATE_CUBLAS_EXCEPTION(unknown ,"unknown");
|
||||||
|
|
||||||
|
#undef ISAAC_CREATE_CUBLAS_EXCEPTION
|
||||||
|
}
|
||||||
|
|
||||||
namespace ocl
|
namespace ocl
|
||||||
{
|
{
|
||||||
@@ -183,9 +196,6 @@ namespace ocl
|
|||||||
#ifdef CL_INVALID_PROPERTY
|
#ifdef CL_INVALID_PROPERTY
|
||||||
ISAAC_CREATE_CL_EXCEPTION(invalid_property, "invalid property");
|
ISAAC_CREATE_CL_EXCEPTION(invalid_property, "invalid property");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
ISAACAPI void check(cl_int err);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -68,6 +68,7 @@ public:
|
|||||||
typedef std::map<std::pair<expression_type, numeric_type>, std::shared_ptr<value_type> > map_type;
|
typedef std::map<std::pair<expression_type, numeric_type>, std::shared_ptr<value_type> > map_type;
|
||||||
private:
|
private:
|
||||||
static std::shared_ptr<templates::base> create(std::string const & template_name, std::vector<int> const & x);
|
static std::shared_ptr<templates::base> create(std::string const & template_name, std::vector<int> const & x);
|
||||||
|
static std::shared_ptr<templates::base> create(std::string const & op, std::string const & x);
|
||||||
static void import(std::string const & fname, driver::CommandQueue const & queue);
|
static void import(std::string const & fname, driver::CommandQueue const & queue);
|
||||||
static map_type & init(driver::CommandQueue const & queue);
|
static map_type & init(driver::CommandQueue const & queue);
|
||||||
public:
|
public:
|
||||||
|
@@ -217,9 +217,9 @@ void backend::platforms(std::vector<Platform> & platforms)
|
|||||||
if(dispatch::clinit())
|
if(dispatch::clinit())
|
||||||
{
|
{
|
||||||
cl_uint nplatforms;
|
cl_uint nplatforms;
|
||||||
check(dispatch::dispatch::clGetPlatformIDs(0, NULL, &nplatforms));
|
dispatch::dispatch::clGetPlatformIDs(0, NULL, &nplatforms);
|
||||||
std::vector<cl_platform_id> clplatforms(nplatforms);
|
std::vector<cl_platform_id> clplatforms(nplatforms);
|
||||||
check(dispatch::dispatch::clGetPlatformIDs(nplatforms, clplatforms.data(), NULL));
|
dispatch::dispatch::clGetPlatformIDs(nplatforms, clplatforms.data(), NULL);
|
||||||
for(cl_platform_id p: clplatforms){
|
for(cl_platform_id p: clplatforms){
|
||||||
Platform tmp(p);
|
Platform tmp(p);
|
||||||
if(tmp.name().find("CUDA")!=std::string::npos && has_cuda)
|
if(tmp.name().find("CUDA")!=std::string::npos && has_cuda)
|
||||||
|
@@ -45,7 +45,7 @@ Buffer::Buffer(Context const & context, size_t size) : backend_(context.backend_
|
|||||||
switch(backend_)
|
switch(backend_)
|
||||||
{
|
{
|
||||||
case CUDA:
|
case CUDA:
|
||||||
check(dispatch::cuMemAlloc(&h_.cu(), size));
|
dispatch::cuMemAlloc(&h_.cu(), size);
|
||||||
break;
|
break;
|
||||||
case OPENCL:
|
case OPENCL:
|
||||||
cl_int err;
|
cl_int err;
|
||||||
|
@@ -112,10 +112,29 @@ void check(CUresult err)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void check(cublasStatus_t err)
|
||||||
|
{
|
||||||
|
using namespace isaac::exception::cublas;
|
||||||
|
switch(err)
|
||||||
|
{
|
||||||
|
case CUBLAS_STATUS_SUCCESS : break;
|
||||||
|
case CUBLAS_STATUS_NOT_INITIALIZED : throw not_initialized();
|
||||||
|
case CUBLAS_STATUS_ALLOC_FAILED : throw alloc_failed();
|
||||||
|
case CUBLAS_STATUS_INVALID_VALUE : throw invalid_value();
|
||||||
|
case CUBLAS_STATUS_ARCH_MISMATCH : throw arch_mismatch();
|
||||||
|
case CUBLAS_STATUS_MAPPING_ERROR : throw mapping_error();
|
||||||
|
case CUBLAS_STATUS_EXECUTION_FAILED: throw execution_failed();
|
||||||
|
case CUBLAS_STATUS_INTERNAL_ERROR : throw internal_error();
|
||||||
|
case CUBLAS_STATUS_NOT_SUPPORTED : throw not_supported();
|
||||||
|
case CUBLAS_STATUS_LICENSE_ERROR : throw license_error();
|
||||||
|
default : throw unknown();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void check_destruction(CUresult result)
|
void check_destruction(CUresult result)
|
||||||
{
|
{
|
||||||
if(result!=CUDA_ERROR_DEINITIALIZED)
|
if(result!=CUDA_ERROR_DEINITIALIZED)
|
||||||
check(result);
|
check(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -179,6 +198,7 @@ void check(cl_int err)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -50,7 +50,7 @@ CommandQueue::CommandQueue(Context const & context, Device const & device, cl_co
|
|||||||
{
|
{
|
||||||
case CUDA:
|
case CUDA:
|
||||||
{
|
{
|
||||||
check(dispatch::cuStreamCreate(&h_.cu(), 0));
|
dispatch::cuStreamCreate(&h_.cu(), 0);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -86,8 +86,8 @@ void CommandQueue::synchronize()
|
|||||||
{
|
{
|
||||||
switch(backend_)
|
switch(backend_)
|
||||||
{
|
{
|
||||||
case CUDA: check(dispatch::cuStreamSynchronize(h_.cu())); break;
|
case CUDA: dispatch::cuStreamSynchronize(h_.cu()); break;
|
||||||
case OPENCL: check(dispatch::clFinish(h_.cl())); break;
|
case OPENCL: dispatch::clFinish(h_.cl()); break;
|
||||||
default: throw;
|
default: throw;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -98,16 +98,16 @@ void CommandQueue::enqueue(Kernel const & kernel, NDRange global, driver::NDRang
|
|||||||
{
|
{
|
||||||
case CUDA:
|
case CUDA:
|
||||||
if(event)
|
if(event)
|
||||||
check(dispatch::cuEventRecord(event->h_.cu().first, h_.cu()));
|
dispatch::cuEventRecord(event->h_.cu().first, h_.cu());
|
||||||
|
|
||||||
check(dispatch::cuLaunchKernel(kernel.h_.cu(), global[0]/local[0], global[1]/local[1], global[2]/local[2],
|
dispatch::cuLaunchKernel(kernel.h_.cu(), global[0]/local[0], global[1]/local[1], global[2]/local[2],
|
||||||
local[0], local[1], local[2], 0, h_.cu(),(void**)&kernel.cu_params_[0], NULL));
|
local[0], local[1], local[2], 0, h_.cu(),(void**)&kernel.cu_params_[0], NULL);
|
||||||
|
|
||||||
if(event)
|
if(event)
|
||||||
check(dispatch::cuEventRecord(event->h_.cu().second, h_.cu()));
|
dispatch::cuEventRecord(event->h_.cu().second, h_.cu());
|
||||||
break;
|
break;
|
||||||
case OPENCL:
|
case OPENCL:
|
||||||
check(dispatch::clEnqueueNDRangeKernel(h_.cl(), kernel.h_.cl(), global.dimension(), NULL, (const size_t *)global, (const size_t *) local, 0, NULL, event?&event->h_.cl():NULL));
|
dispatch::clEnqueueNDRangeKernel(h_.cl(), kernel.h_.cl(), global.dimension(), NULL, (const size_t *)global, (const size_t *) local, 0, NULL, event?&event->h_.cl():NULL);
|
||||||
break;
|
break;
|
||||||
default: throw;
|
default: throw;
|
||||||
}
|
}
|
||||||
@@ -119,12 +119,12 @@ void CommandQueue::write(Buffer const & buffer, bool blocking, std::size_t offse
|
|||||||
{
|
{
|
||||||
case CUDA:
|
case CUDA:
|
||||||
if(blocking)
|
if(blocking)
|
||||||
check(dispatch::cuMemcpyHtoD(buffer.h_.cu() + offset, ptr, size));
|
dispatch::cuMemcpyHtoD(buffer.h_.cu() + offset, ptr, size);
|
||||||
else
|
else
|
||||||
check(dispatch::cuMemcpyHtoDAsync(buffer.h_.cu() + offset, ptr, size, h_.cu()));
|
dispatch::cuMemcpyHtoDAsync(buffer.h_.cu() + offset, ptr, size, h_.cu());
|
||||||
break;
|
break;
|
||||||
case OPENCL:
|
case OPENCL:
|
||||||
check(dispatch::clEnqueueWriteBuffer(h_.cl(), buffer.h_.cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL));
|
dispatch::clEnqueueWriteBuffer(h_.cl(), buffer.h_.cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL);
|
||||||
break;
|
break;
|
||||||
default: throw;
|
default: throw;
|
||||||
}
|
}
|
||||||
@@ -136,12 +136,12 @@ void CommandQueue::read(Buffer const & buffer, bool blocking, std::size_t offset
|
|||||||
{
|
{
|
||||||
case CUDA:
|
case CUDA:
|
||||||
if(blocking)
|
if(blocking)
|
||||||
check(dispatch::cuMemcpyDtoH(ptr, buffer.h_.cu() + offset, size));
|
dispatch::cuMemcpyDtoH(ptr, buffer.h_.cu() + offset, size);
|
||||||
else
|
else
|
||||||
check(dispatch::cuMemcpyDtoHAsync(ptr, buffer.h_.cu() + offset, size, h_.cu()));
|
dispatch::cuMemcpyDtoHAsync(ptr, buffer.h_.cu() + offset, size, h_.cu());
|
||||||
break;
|
break;
|
||||||
case OPENCL:
|
case OPENCL:
|
||||||
check(dispatch::clEnqueueReadBuffer(h_.cl(), buffer.h_.cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL));
|
dispatch::clEnqueueReadBuffer(h_.cl(), buffer.h_.cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL);
|
||||||
break;
|
break;
|
||||||
default: throw;
|
default: throw;
|
||||||
}
|
}
|
||||||
|
@@ -74,7 +74,7 @@ Context::Context(Device const & device) : backend_(device.backend_), device_(dev
|
|||||||
switch(backend_)
|
switch(backend_)
|
||||||
{
|
{
|
||||||
case CUDA:
|
case CUDA:
|
||||||
check(dispatch::cuCtxCreate(&h_.cu(), CU_CTX_SCHED_AUTO, device.h_.cu()));
|
dispatch::cuCtxCreate(&h_.cu(), CU_CTX_SCHED_AUTO, device.h_.cu());
|
||||||
break;
|
break;
|
||||||
case OPENCL:
|
case OPENCL:
|
||||||
cl_int err;
|
cl_int err;
|
||||||
|
@@ -38,7 +38,7 @@ template<CUdevice_attribute attr>
|
|||||||
int Device::cuGetInfo() const
|
int Device::cuGetInfo() const
|
||||||
{
|
{
|
||||||
int res;
|
int res;
|
||||||
check(dispatch::cuDeviceGetAttribute(&res, attr, h_.cu()));
|
dispatch::cuDeviceGetAttribute(&res, attr, h_.cu());
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -197,7 +197,7 @@ std::string Device::name() const
|
|||||||
{
|
{
|
||||||
case CUDA:
|
case CUDA:
|
||||||
char tmp[128];
|
char tmp[128];
|
||||||
check(dispatch::cuDeviceGetName(tmp, 128, h_.cu()));
|
dispatch::cuDeviceGetName(tmp, 128, h_.cu());
|
||||||
return std::string(tmp);
|
return std::string(tmp);
|
||||||
case OPENCL:
|
case OPENCL:
|
||||||
return ocl::info<CL_DEVICE_NAME>(h_.cl());
|
return ocl::info<CL_DEVICE_NAME>(h_.cl());
|
||||||
|
@@ -20,6 +20,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include "isaac/driver/dispatch.h"
|
#include "isaac/driver/dispatch.h"
|
||||||
|
#include "isaac/driver/context.h"
|
||||||
|
|
||||||
namespace isaac
|
namespace isaac
|
||||||
{
|
{
|
||||||
@@ -126,11 +127,8 @@ bool dispatch::nvrtcinit()
|
|||||||
|
|
||||||
bool dispatch::cublasinit()
|
bool dispatch::cublasinit()
|
||||||
{
|
{
|
||||||
if(cublas_==nullptr){
|
if(cublas_==nullptr)
|
||||||
cublas_ = dlopen("libcublas.so", RTLD_LAZY);
|
cublas_ = dlopen("libcublas.so", RTLD_LAZY);
|
||||||
if(cublas_!=nullptr)
|
|
||||||
dispatch::cublasCreate_v2(&cublas_handle_);
|
|
||||||
}
|
|
||||||
return cublas_ != nullptr;
|
return cublas_ != nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -212,19 +210,28 @@ NVRTC_DEFINE2(nvrtcResult, nvrtcGetPTXSize, nvrtcProgram, size_t *)
|
|||||||
NVRTC_DEFINE6(nvrtcResult, nvrtcCreateProgram, nvrtcProgram *, const char *, const char *, int, const char **, const char **)
|
NVRTC_DEFINE6(nvrtcResult, nvrtcCreateProgram, nvrtcProgram *, const char *, const char *, int, const char **, const char **)
|
||||||
NVRTC_DEFINE2(nvrtcResult, nvrtcGetProgramLog, nvrtcProgram, char *)
|
NVRTC_DEFINE2(nvrtcResult, nvrtcGetProgramLog, nvrtcProgram, char *)
|
||||||
|
|
||||||
|
cublasHandle_t dispatch::cublasHandle(Context const & ctx)
|
||||||
|
{
|
||||||
|
static std::map<Context, cublasHandle_t> handles;
|
||||||
|
auto pr = handles.insert({ctx, cublasHandle_t()});
|
||||||
|
if(pr.second)
|
||||||
|
cublasCreate_v2(&pr.first->second);
|
||||||
|
return pr.first->second;
|
||||||
|
}
|
||||||
|
|
||||||
CUBLAS_DEFINE1(cublasStatus_t, cublasCreate_v2, cublasHandle_t*)
|
CUBLAS_DEFINE1(cublasStatus_t, cublasCreate_v2, cublasHandle_t*)
|
||||||
|
|
||||||
cublasStatus_t dispatch::cublasGetStream(cudaStream_t *a)
|
cublasStatus_t dispatch::cublasGetStream(cublasHandle_t h, cudaStream_t *a)
|
||||||
{ return f_impl<dispatch::cublasinit>(cublas_, cublasGetStream_v2, cublasGetStream_, "cublasGetStream_v2", cublas_handle_, a); }
|
{ return f_impl<dispatch::cublasinit>(cublas_, cublasGetStream_v2, cublasGetStream_, "cublasGetStream_v2", h, a); }
|
||||||
|
|
||||||
cublasStatus_t dispatch::cublasSetStream(cudaStream_t a)
|
cublasStatus_t dispatch::cublasSetStream(cublasHandle_t h, cudaStream_t a)
|
||||||
{ return f_impl<dispatch::cublasinit>(cublas_, cublasSetStream_v2, cublasSetStream_, "cublasSetStream_v2", cublas_handle_, a); }
|
{ return f_impl<dispatch::cublasinit>(cublas_, cublasSetStream_v2, cublasSetStream_, "cublasSetStream_v2", h, a); }
|
||||||
|
|
||||||
cublasStatus_t dispatch::cublasSgemm(cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc)
|
cublasStatus_t dispatch::cublasSgemm(cublasHandle_t h, cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc)
|
||||||
{ return f_impl<dispatch::cublasinit>(cublas_, cublasSgemm_v2, cublasSgemm_, "cublasSgemm_v2", cublas_handle_, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
|
{ return f_impl<dispatch::cublasinit>(cublas_, cublasSgemm_v2, cublasSgemm_, "cublasSgemm_v2", h, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
|
||||||
|
|
||||||
cublasStatus_t dispatch::cublasDgemm(cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc)
|
cublasStatus_t dispatch::cublasDgemm(cublasHandle_t h, cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc)
|
||||||
{ return f_impl<dispatch::cublasinit>(cublas_, cublasDgemm_v2, cublasDgemm_, "cublasDgemm_v2", cublas_handle_, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
|
{ return f_impl<dispatch::cublasinit>(cublas_, cublasDgemm_v2, cublasDgemm_, "cublasDgemm_v2", h, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
|
||||||
|
|
||||||
void dispatch::release()
|
void dispatch::release()
|
||||||
{
|
{
|
||||||
@@ -250,7 +257,6 @@ void * dispatch::opencl_;
|
|||||||
void * dispatch::cuda_;
|
void * dispatch::cuda_;
|
||||||
void * dispatch::nvrtc_;
|
void * dispatch::nvrtc_;
|
||||||
void * dispatch::cublas_;
|
void * dispatch::cublas_;
|
||||||
cublasHandle_t dispatch::cublas_handle_;
|
|
||||||
|
|
||||||
//OpenCL
|
//OpenCL
|
||||||
void* dispatch::clBuildProgram_;
|
void* dispatch::clBuildProgram_;
|
||||||
|
@@ -33,8 +33,8 @@ Event::Event(backend_type backend) : backend_(backend), h_(backend_, true)
|
|||||||
switch(backend_)
|
switch(backend_)
|
||||||
{
|
{
|
||||||
case CUDA:
|
case CUDA:
|
||||||
check(dispatch::dispatch::cuEventCreate(&h_.cu().first, CU_EVENT_DEFAULT));
|
dispatch::cuEventCreate(&h_.cu().first, CU_EVENT_DEFAULT);
|
||||||
check(dispatch::dispatch::cuEventCreate(&h_.cu().second, CU_EVENT_DEFAULT));
|
dispatch::cuEventCreate(&h_.cu().second, CU_EVENT_DEFAULT);
|
||||||
break;
|
break;
|
||||||
case OPENCL:
|
case OPENCL:
|
||||||
break;
|
break;
|
||||||
@@ -54,7 +54,7 @@ long Event::elapsed_time() const
|
|||||||
{
|
{
|
||||||
case CUDA:
|
case CUDA:
|
||||||
float time;
|
float time;
|
||||||
check(dispatch::cuEventElapsedTime(&time, h_.cu().first, h_.cu().second));
|
dispatch::cuEventElapsedTime(&time, h_.cu().first, h_.cu().second);
|
||||||
return 1e6*time;
|
return 1e6*time;
|
||||||
case OPENCL:
|
case OPENCL:
|
||||||
return static_cast<long>(ocl::info<CL_PROFILING_COMMAND_END>(h_.cl()) - ocl::info<CL_PROFILING_COMMAND_START>(h_.cl()));
|
return static_cast<long>(ocl::info<CL_PROFILING_COMMAND_END>(h_.cl()) - ocl::info<CL_PROFILING_COMMAND_START>(h_.cl()));
|
||||||
|
@@ -37,7 +37,7 @@ template<class CLType, class CUType>
|
|||||||
void Handle<CLType, CUType>::_delete(CUcontext x) { check_destruction(dispatch::cuCtxDestroy(x)); }
|
void Handle<CLType, CUType>::_delete(CUcontext x) { check_destruction(dispatch::cuCtxDestroy(x)); }
|
||||||
|
|
||||||
template<class CLType, class CUType>
|
template<class CLType, class CUType>
|
||||||
void Handle<CLType, CUType>::_delete(CUdeviceptr x) { check_destruction(dispatch::dispatch::cuMemFree(x)); }
|
void Handle<CLType, CUType>::_delete(CUdeviceptr x) { check_destruction(dispatch::cuMemFree(x)); }
|
||||||
|
|
||||||
template<class CLType, class CUType>
|
template<class CLType, class CUType>
|
||||||
void Handle<CLType, CUType>::_delete(CUstream x) { check_destruction(dispatch::cuStreamDestroy(x)); }
|
void Handle<CLType, CUType>::_delete(CUstream x) { check_destruction(dispatch::cuStreamDestroy(x)); }
|
||||||
@@ -46,38 +46,38 @@ template<class CLType, class CUType>
|
|||||||
void Handle<CLType, CUType>::_delete(CUdevice) { }
|
void Handle<CLType, CUType>::_delete(CUdevice) { }
|
||||||
|
|
||||||
template<class CLType, class CUType>
|
template<class CLType, class CUType>
|
||||||
void Handle<CLType, CUType>::_delete(CUevent x) { check_destruction(dispatch::dispatch::cuEventDestroy(x)); }
|
void Handle<CLType, CUType>::_delete(CUevent x) { check_destruction(dispatch::cuEventDestroy(x)); }
|
||||||
|
|
||||||
template<class CLType, class CUType>
|
template<class CLType, class CUType>
|
||||||
void Handle<CLType, CUType>::_delete(CUfunction) { }
|
void Handle<CLType, CUType>::_delete(CUfunction) { }
|
||||||
|
|
||||||
template<class CLType, class CUType>
|
template<class CLType, class CUType>
|
||||||
void Handle<CLType, CUType>::_delete(CUmodule x) { check_destruction(dispatch::dispatch::cuModuleUnload(x)); }
|
void Handle<CLType, CUType>::_delete(CUmodule x) { check_destruction(dispatch::cuModuleUnload(x)); }
|
||||||
|
|
||||||
template<class CLType, class CUType>
|
template<class CLType, class CUType>
|
||||||
void Handle<CLType, CUType>::_delete(cu_event_t x) { _delete(x.first); _delete(x.second); }
|
void Handle<CLType, CUType>::_delete(cu_event_t x) { _delete(x.first); _delete(x.second); }
|
||||||
|
|
||||||
//OpenCL
|
//OpenCL
|
||||||
template<class CLType, class CUType>
|
template<class CLType, class CUType>
|
||||||
void Handle<CLType, CUType>::release(cl_context x) { check(dispatch::clReleaseContext(x)); }
|
void Handle<CLType, CUType>::release(cl_context x) { dispatch::clReleaseContext(x); }
|
||||||
|
|
||||||
template<class CLType, class CUType>
|
template<class CLType, class CUType>
|
||||||
void Handle<CLType, CUType>::release(cl_mem x) { check(dispatch::clReleaseMemObject(x)); }
|
void Handle<CLType, CUType>::release(cl_mem x) { dispatch::clReleaseMemObject(x); }
|
||||||
|
|
||||||
template<class CLType, class CUType>
|
template<class CLType, class CUType>
|
||||||
void Handle<CLType, CUType>::release(cl_command_queue x) { check(dispatch::clReleaseCommandQueue(x)); }
|
void Handle<CLType, CUType>::release(cl_command_queue x) { dispatch::clReleaseCommandQueue(x); }
|
||||||
|
|
||||||
template<class CLType, class CUType>
|
template<class CLType, class CUType>
|
||||||
void Handle<CLType, CUType>::release(cl_device_id x) { check(dispatch::clReleaseDevice(x)); }
|
void Handle<CLType, CUType>::release(cl_device_id x) { dispatch::clReleaseDevice(x); }
|
||||||
|
|
||||||
template<class CLType, class CUType>
|
template<class CLType, class CUType>
|
||||||
void Handle<CLType, CUType>::release(cl_event x) { check(dispatch::clReleaseEvent(x)); }
|
void Handle<CLType, CUType>::release(cl_event x) { dispatch::clReleaseEvent(x); }
|
||||||
|
|
||||||
template<class CLType, class CUType>
|
template<class CLType, class CUType>
|
||||||
void Handle<CLType, CUType>::release(cl_kernel x) { check(dispatch::clReleaseKernel(x)); }
|
void Handle<CLType, CUType>::release(cl_kernel x) { dispatch::clReleaseKernel(x); }
|
||||||
|
|
||||||
template<class CLType, class CUType>
|
template<class CLType, class CUType>
|
||||||
void Handle<CLType, CUType>::release(cl_program x) { check(dispatch::clReleaseProgram(x)); }
|
void Handle<CLType, CUType>::release(cl_program x) { dispatch::clReleaseProgram(x); }
|
||||||
|
|
||||||
template<class CLType, class CUType>
|
template<class CLType, class CUType>
|
||||||
Handle<CLType, CUType>::Handle(backend_type backend, bool take_ownership): backend_(backend), has_ownership_(take_ownership)
|
Handle<CLType, CUType>::Handle(backend_type backend, bool take_ownership): backend_(backend), has_ownership_(take_ownership)
|
||||||
|
@@ -45,8 +45,7 @@ struct info<cl_mem>
|
|||||||
|
|
||||||
static void get(cl_mem handle, cl_mem_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret)
|
static void get(cl_mem handle, cl_mem_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret)
|
||||||
{
|
{
|
||||||
cl_int err = dispatch::clGetMemObjectInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
dispatch::clGetMemObjectInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||||
check(err);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -57,8 +56,7 @@ struct info<cl_device_id>
|
|||||||
|
|
||||||
static void get(cl_device_id handle, cl_device_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret)
|
static void get(cl_device_id handle, cl_device_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret)
|
||||||
{
|
{
|
||||||
cl_int err = dispatch::clGetDeviceInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
dispatch::clGetDeviceInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||||
check(err);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -68,13 +66,11 @@ struct info<cl_kernel>
|
|||||||
typedef cl_kernel_info type;
|
typedef cl_kernel_info type;
|
||||||
|
|
||||||
static void get(cl_kernel handle, cl_kernel_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
static void get(cl_kernel handle, cl_kernel_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
||||||
cl_int err = dispatch::clGetKernelInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
dispatch::clGetKernelInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||||
check(err);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void get(cl_kernel handle, cl_device_id dev_id, cl_kernel_work_group_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
static void get(cl_kernel handle, cl_device_id dev_id, cl_kernel_work_group_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
||||||
cl_int err = dispatch::clGetKernelWorkGroupInfo(handle, dev_id, param_name,param_value_size,param_value,param_value_size_ret);
|
dispatch::clGetKernelWorkGroupInfo(handle, dev_id, param_name,param_value_size,param_value,param_value_size_ret);
|
||||||
check(err);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -84,8 +80,7 @@ struct info<cl_context>
|
|||||||
typedef cl_context_info type;
|
typedef cl_context_info type;
|
||||||
|
|
||||||
static void get(cl_context handle, cl_context_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
static void get(cl_context handle, cl_context_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
||||||
cl_int err = dispatch::clGetContextInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
dispatch::clGetContextInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||||
check(err);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -95,13 +90,11 @@ struct info<cl_program>
|
|||||||
typedef cl_program_info type;
|
typedef cl_program_info type;
|
||||||
|
|
||||||
static void get(cl_program handle, cl_program_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
static void get(cl_program handle, cl_program_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
||||||
cl_int err = dispatch::clGetProgramInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
dispatch::clGetProgramInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||||
check(err);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void get(cl_program handle, cl_device_id device, cl_program_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
static void get(cl_program handle, cl_device_id device, cl_program_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
||||||
cl_int err = dispatch::clGetProgramBuildInfo(handle,device,param_name,param_value_size,param_value,param_value_size_ret);
|
dispatch::clGetProgramBuildInfo(handle,device,param_name,param_value_size,param_value,param_value_size_ret);
|
||||||
check(err);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -111,8 +104,7 @@ struct info<cl_event>
|
|||||||
{
|
{
|
||||||
typedef cl_profiling_info type;
|
typedef cl_profiling_info type;
|
||||||
static void get(cl_event handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
static void get(cl_event handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
||||||
cl_int err = dispatch::clGetEventProfilingInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
dispatch::clGetEventProfilingInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||||
check(err);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -121,8 +113,7 @@ struct info<cl_command_queue>
|
|||||||
{
|
{
|
||||||
typedef cl_command_queue_info type;
|
typedef cl_command_queue_info type;
|
||||||
static void get(cl_command_queue handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
static void get(cl_command_queue handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
||||||
cl_int err = dispatch::clGetCommandQueueInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
dispatch::clGetCommandQueueInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||||
check(err);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -131,8 +122,7 @@ struct info<cl_platform_id>
|
|||||||
{
|
{
|
||||||
typedef cl_command_queue_info type;
|
typedef cl_command_queue_info type;
|
||||||
static void get(cl_platform_id handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
static void get(cl_platform_id handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
|
||||||
cl_int err = dispatch::clGetPlatformInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
dispatch::clGetPlatformInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
|
||||||
check(err);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@@ -38,7 +38,7 @@ Kernel::Kernel(Program const & program, const char * name) : backend_(program.ba
|
|||||||
case CUDA:
|
case CUDA:
|
||||||
cu_params_store_.reserve(64);
|
cu_params_store_.reserve(64);
|
||||||
cu_params_.reserve(64);
|
cu_params_.reserve(64);
|
||||||
check(dispatch::cuModuleGetFunction(&h_.cu(), program.h_.cu(), name));\
|
dispatch::cuModuleGetFunction(&h_.cu(), program.h_.cu(), name);\
|
||||||
break;
|
break;
|
||||||
case OPENCL:
|
case OPENCL:
|
||||||
cl_int err;
|
cl_int err;
|
||||||
@@ -85,7 +85,7 @@ void Kernel::setArg(unsigned int index, std::size_t size, void* ptr)
|
|||||||
cu_params_[index] = cu_params_store_[index].get();
|
cu_params_[index] = cu_params_store_[index].get();
|
||||||
break;
|
break;
|
||||||
case OPENCL:
|
case OPENCL:
|
||||||
check(dispatch::clSetKernelArg(h_.cl(), index, size, ptr));
|
dispatch::clSetKernelArg(h_.cl(), index, size, ptr);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
throw;
|
throw;
|
||||||
@@ -101,7 +101,7 @@ void Kernel::setArg(unsigned int index, Buffer const & data)
|
|||||||
setArg(index, sizeof(CUdeviceptr), (void*)&data.h_.cu()); break;
|
setArg(index, sizeof(CUdeviceptr), (void*)&data.h_.cu()); break;
|
||||||
}
|
}
|
||||||
case OPENCL:
|
case OPENCL:
|
||||||
check(dispatch::clSetKernelArg(h_.cl(), index, sizeof(cl_mem), (void*)&data.h_.cl()));
|
dispatch::clSetKernelArg(h_.cl(), index, sizeof(cl_mem), (void*)&data.h_.cl());
|
||||||
break;
|
break;
|
||||||
default: throw;
|
default: throw;
|
||||||
}
|
}
|
||||||
|
@@ -78,7 +78,7 @@ void Platform::devices(std::vector<Device> & devices) const
|
|||||||
case CUDA:
|
case CUDA:
|
||||||
{
|
{
|
||||||
int N;
|
int N;
|
||||||
check(dispatch::cuDeviceGetCount(&N));
|
dispatch::cuDeviceGetCount(&N);
|
||||||
for(int i = 0 ; i < N ; ++i){
|
for(int i = 0 ; i < N ; ++i){
|
||||||
CUdevice device;
|
CUdevice device;
|
||||||
dispatch::cuDeviceGet(&device, i);
|
dispatch::cuDeviceGet(&device, i);
|
||||||
@@ -89,9 +89,9 @@ void Platform::devices(std::vector<Device> & devices) const
|
|||||||
case OPENCL:
|
case OPENCL:
|
||||||
{
|
{
|
||||||
cl_uint ndevices;
|
cl_uint ndevices;
|
||||||
check(dispatch::dispatch::clGetDeviceIDs(cl_platform_, CL_DEVICE_TYPE_ALL, 0, NULL, &ndevices));
|
dispatch::dispatch::clGetDeviceIDs(cl_platform_, CL_DEVICE_TYPE_ALL, 0, NULL, &ndevices);
|
||||||
std::vector<cl_device_id> device_ids(ndevices);
|
std::vector<cl_device_id> device_ids(ndevices);
|
||||||
check(dispatch::dispatch::clGetDeviceIDs(cl_platform_, CL_DEVICE_TYPE_ALL, ndevices, device_ids.data(), NULL));
|
dispatch::dispatch::clGetDeviceIDs(cl_platform_, CL_DEVICE_TYPE_ALL, ndevices, device_ids.data(), NULL);
|
||||||
for(cl_device_id d : device_ids)
|
for(cl_device_id d : device_ids)
|
||||||
devices.push_back(Device(d));
|
devices.push_back(Device(d));
|
||||||
break;
|
break;
|
||||||
|
@@ -55,7 +55,7 @@ Program::Program(Context const & context, std::string const & source) : backend_
|
|||||||
//Load cached program
|
//Load cached program
|
||||||
if(cache_path.size() && std::ifstream(fname, std::ios::binary))
|
if(cache_path.size() && std::ifstream(fname, std::ios::binary))
|
||||||
{
|
{
|
||||||
check(dispatch::cuModuleLoad(&h_.cu(), fname.c_str()));
|
dispatch::cuModuleLoad(&h_.cu(), fname.c_str());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -64,28 +64,28 @@ Program::Program(Context const & context, std::string const & source) : backend_
|
|||||||
const char * includes[] = {"vector.h"};
|
const char * includes[] = {"vector.h"};
|
||||||
const char * src[] = {helpers::cuda::vector};
|
const char * src[] = {helpers::cuda::vector};
|
||||||
|
|
||||||
check(dispatch::nvrtcCreateProgram(&prog, source.c_str(), NULL, 1, src, includes));
|
dispatch::nvrtcCreateProgram(&prog, source.c_str(), NULL, 1, src, includes);
|
||||||
try{
|
try{
|
||||||
std::pair<unsigned int, unsigned int> capability = context_.device().nv_compute_capability();
|
std::pair<unsigned int, unsigned int> capability = context_.device().nv_compute_capability();
|
||||||
std::string capability_opt = "--gpu-architecture=compute_";
|
std::string capability_opt = "--gpu-architecture=compute_";
|
||||||
capability_opt += tools::to_string(capability.first) + tools::to_string(capability.second);
|
capability_opt += tools::to_string(capability.first) + tools::to_string(capability.second);
|
||||||
const char * options[] = {capability_opt.c_str(), "--restrict"};
|
const char * options[] = {capability_opt.c_str(), "--restrict"};
|
||||||
check(dispatch::nvrtcCompileProgram(prog, 2, options));
|
dispatch::nvrtcCompileProgram(prog, 2, options);
|
||||||
}catch(exception::nvrtc::compilation const &)
|
}catch(exception::nvrtc::compilation const &)
|
||||||
{
|
{
|
||||||
size_t logsize;
|
size_t logsize;
|
||||||
check(dispatch::nvrtcGetProgramLogSize(prog, &logsize));
|
dispatch::nvrtcGetProgramLogSize(prog, &logsize);
|
||||||
std::string log(logsize, 0);
|
std::string log(logsize, 0);
|
||||||
check(dispatch::nvrtcGetProgramLog(prog, (char*)log.data()));
|
dispatch::nvrtcGetProgramLog(prog, (char*)log.data());
|
||||||
std::cout << "Compilation failed:" << std::endl;
|
std::cout << "Compilation failed:" << std::endl;
|
||||||
std::cout << log << std::endl;
|
std::cout << log << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ptx_size;
|
size_t ptx_size;
|
||||||
check(dispatch::nvrtcGetPTXSize(prog, &ptx_size));
|
dispatch::nvrtcGetPTXSize(prog, &ptx_size);
|
||||||
std::vector<char> ptx(ptx_size);
|
std::vector<char> ptx(ptx_size);
|
||||||
check(dispatch::nvrtcGetPTX(prog, ptx.data()));
|
dispatch::nvrtcGetPTX(prog, ptx.data());
|
||||||
check(dispatch::cuModuleLoadDataEx(&h_.cu(), ptx.data(), 0, NULL, NULL));
|
dispatch::cuModuleLoadDataEx(&h_.cu(), ptx.data(), 0, NULL, NULL);
|
||||||
|
|
||||||
//Save cached program
|
//Save cached program
|
||||||
if (cache_path.size())
|
if (cache_path.size())
|
||||||
@@ -112,7 +112,7 @@ Program::Program(Context const & context, std::string const & source) : backend_
|
|||||||
|
|
||||||
// str.assign((std::istreambuf_iterator<char>(ifs)),
|
// str.assign((std::istreambuf_iterator<char>(ifs)),
|
||||||
// std::istreambuf_iterator<char>());
|
// std::istreambuf_iterator<char>());
|
||||||
// check(dispatch::cuModuleLoadDataEx(&h_.cu(), str.c_str(), 0, NULL, NULL));
|
// dispatch::cuModuleLoadDataEx(&h_.cu(), str.c_str(), 0, NULL, NULL);
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -141,7 +141,7 @@ Program::Program(Context const & context, std::string const & source) : backend_
|
|||||||
char* cbuffer = buffer.data();
|
char* cbuffer = buffer.data();
|
||||||
h_.cl() = dispatch::clCreateProgramWithBinary(context_.h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), &len, (const unsigned char **)&cbuffer, NULL, &err);
|
h_.cl() = dispatch::clCreateProgramWithBinary(context_.h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), &len, (const unsigned char **)&cbuffer, NULL, &err);
|
||||||
check(err);
|
check(err);
|
||||||
check(dispatch::clBuildProgram(h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), build_opt.c_str(), NULL, NULL));
|
dispatch::clBuildProgram(h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), build_opt.c_str(), NULL, NULL);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -150,7 +150,7 @@ Program::Program(Context const & context, std::string const & source) : backend_
|
|||||||
const char * csrc = source.c_str();
|
const char * csrc = source.c_str();
|
||||||
h_.cl() = dispatch::clCreateProgramWithSource(context_.h_.cl(), 1, &csrc, &srclen, &err);
|
h_.cl() = dispatch::clCreateProgramWithSource(context_.h_.cl(), 1, &csrc, &srclen, &err);
|
||||||
try{
|
try{
|
||||||
check(dispatch::clBuildProgram(h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), build_opt.c_str(), NULL, NULL));
|
dispatch::clBuildProgram(h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), build_opt.c_str(), NULL, NULL);
|
||||||
//Save cached program
|
//Save cached program
|
||||||
if (cache_path.size())
|
if (cache_path.size())
|
||||||
{
|
{
|
||||||
|
@@ -50,7 +50,7 @@ std::vector<int_t> infos(expression_tree const & tree, symbolic::preset::gemm::a
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* ------------------ CUBLAS ------------------ */
|
/* ------------------ CUBLAS ------------------ */
|
||||||
cublas_gemm::cublas_gemm(char A_trans, char B_trans): A_trans_(A_trans), B_trans_(B_trans), init_(driver::dispatch::cublasinit())
|
cublas_gemm::cublas_gemm(char A_trans, char B_trans): A_trans_(A_trans), B_trans_(B_trans), init_(true)
|
||||||
{ }
|
{ }
|
||||||
|
|
||||||
int cublas_gemm::is_invalid(expression_tree const &, driver::Device const & device) const
|
int cublas_gemm::is_invalid(expression_tree const &, driver::Device const & device) const
|
||||||
@@ -85,30 +85,31 @@ void cublas_gemm::enqueue(driver::CommandQueue & queue, driver::Program const &,
|
|||||||
CUdeviceptr cuB = args.B->array.handle.cu;
|
CUdeviceptr cuB = args.B->array.handle.cu;
|
||||||
CUdeviceptr cuC = args.C->array.handle.cu;
|
CUdeviceptr cuC = args.C->array.handle.cu;
|
||||||
runtime::execution_options_type const & opt = control.execution_options();
|
runtime::execution_options_type const & opt = control.execution_options();
|
||||||
auto cuT = [](char xt) { return xt=='N'?CUBLAS_OP_N:CUBLAS_OP_T; };
|
auto cuT = [](char xt) { return (xt=='N')?CUBLAS_OP_N:CUBLAS_OP_T; };
|
||||||
|
int offA = args.A->array.start, offB = args.B->array.start, offC = args.C->array.start;
|
||||||
|
cublasHandle_t h = drv::dispatch::cublasHandle(queue.context());
|
||||||
//Set new stream
|
//Set new stream
|
||||||
cudaStream_t bkp;
|
cudaStream_t bkp;
|
||||||
drv::Event event(drv::CUDA);
|
drv::Event event(drv::CUDA);
|
||||||
drv::dispatch::cublasGetStream(&bkp);
|
drv::dispatch::cublasGetStream(h,&bkp);
|
||||||
drv::dispatch::cublasSetStream((cudaStream_t)queue.handle().cu());
|
drv::dispatch::cublasSetStream(h,(cudaStream_t)queue.handle().cu());
|
||||||
values_holder alpha = args.alpha.values();
|
values_holder alpha = args.alpha.values();
|
||||||
values_holder beta = args.beta.values();
|
values_holder beta = args.beta.values();
|
||||||
if(opt.events)
|
if(opt.events)
|
||||||
drv::check(drv::dispatch::cuEventRecord(event.handle().cu().first, queue.handle().cu()));
|
drv::check(drv::dispatch::cuEventRecord(event.handle().cu().first, queue.handle().cu()));
|
||||||
if(args.C->dtype==FLOAT_TYPE)
|
if(args.C->dtype==FLOAT_TYPE)
|
||||||
drv::dispatch::cublasSgemm(cuT(A_trans_), cuT(B_trans_), M, N, K, &alpha.float32, (float*)cuA, args.A->ld[1], (float*)cuB, args.B->ld[1], &beta.float32, (float*)cuC, args.C->ld[1]);
|
drv::dispatch::cublasSgemm(h,cuT(A_trans_), cuT(B_trans_), M, N, K, &alpha.float32, (float*)cuA + offA , args.A->ld[1], (float*)cuB + offB, args.B->ld[1], &beta.float32, (float*)cuC + offC, args.C->ld[1]);
|
||||||
else
|
else
|
||||||
drv::dispatch::cublasDgemm(cuT(A_trans_), cuT(B_trans_), M, N, K, &alpha.float64, (double*)cuA, args.A->ld[1], (double*)cuB, args.B->ld[1], &beta.float64, (double*)cuC, args.C->ld[1]);
|
drv::dispatch::cublasDgemm(h,cuT(A_trans_), cuT(B_trans_), M, N, K, &alpha.float64, (double*)cuA + offA, args.A->ld[1], (double*)cuB + offB, args.B->ld[1], &beta.float64, (double*)cuC + offC, args.C->ld[1]);
|
||||||
if(opt.events){
|
if(opt.events){
|
||||||
drv::check(drv::dispatch::cuEventRecord(event.handle().cu().second, queue.handle().cu()));
|
drv::check(drv::dispatch::cuEventRecord(event.handle().cu().second, queue.handle().cu()));
|
||||||
opt.events->push_back(event);
|
opt.events->push_back(event);
|
||||||
}
|
}
|
||||||
//Revert old stream
|
//Revert old stream
|
||||||
drv::dispatch::cublasSetStream(bkp);
|
drv::dispatch::cublasSetStream(h,bkp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/* -------------------------------------------- */
|
/* -------------------------------------------- */
|
||||||
unsigned int gemm::lmem_usage(expression_tree const & expression) const
|
unsigned int gemm::lmem_usage(expression_tree const & expression) const
|
||||||
{
|
{
|
||||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
@@ -139,6 +139,16 @@ profiles::value_type::templates_container const & profiles::value_type::template
|
|||||||
return templates_;
|
return templates_;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::shared_ptr<templates::base> profiles::create(std::string const & op, std::string const & str)
|
||||||
|
{
|
||||||
|
if(str=="cublas_gemm"){
|
||||||
|
if(op=="gemm_nn") return std::shared_ptr<templates::base>(new templates::cublas_gemm('N', 'N'));
|
||||||
|
if(op=="gemm_nt") return std::shared_ptr<templates::base>(new templates::cublas_gemm('N', 'T'));
|
||||||
|
if(op=="gemm_tn") return std::shared_ptr<templates::base>(new templates::cublas_gemm('T', 'N'));
|
||||||
|
if(op=="gemm_tt") return std::shared_ptr<templates::base>(new templates::cublas_gemm('T', 'T'));
|
||||||
|
}
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
|
||||||
std::shared_ptr<templates::base> profiles::create(std::string const & template_name, std::vector<int> const & x)
|
std::shared_ptr<templates::base> profiles::create(std::string const & template_name, std::vector<int> const & x)
|
||||||
{
|
{
|
||||||
@@ -189,8 +199,12 @@ void profiles::import(std::string const & str, driver::CommandQueue const & queu
|
|||||||
// Get profiles
|
// Get profiles
|
||||||
std::vector<std::shared_ptr<templates::base> > templates;
|
std::vector<std::shared_ptr<templates::base> > templates;
|
||||||
rapidjson::Value const & profiles = document[opcstr][dtcstr]["profiles"];
|
rapidjson::Value const & profiles = document[opcstr][dtcstr]["profiles"];
|
||||||
for (rapidjson::SizeType id = 0 ; id < profiles.Size() ; ++id)
|
for (rapidjson::SizeType i = 0 ; i < profiles.Size() ; ++i){
|
||||||
templates.push_back(create(operation, rapidjson::to_int_array<int>(profiles[id])));
|
if(profiles[i].IsString())
|
||||||
|
templates.push_back(create(operation, profiles[i].GetString()));
|
||||||
|
else
|
||||||
|
templates.push_back(create(operation, rapidjson::to_int_array<int>(profiles[i])));
|
||||||
|
}
|
||||||
if(templates.size()>1){
|
if(templates.size()>1){
|
||||||
// Get predictor
|
// Get predictor
|
||||||
predictors::random_forest predictor(document[opcstr][dtcstr]["predictor"]);
|
predictors::random_forest predictor(document[opcstr][dtcstr]["predictor"]);
|
||||||
|
@@ -174,7 +174,7 @@ def is_local_optimum(parameters, template, sizes, context):
|
|||||||
|
|
||||||
#Evaluate the provided parameters guess
|
#Evaluate the provided parameters guess
|
||||||
reference = tools.benchmark(template(*parameters), tree)
|
reference = tools.benchmark(template(*parameters), tree)
|
||||||
if isinf(reference):
|
if reference==float('inf'):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
#Latency bound -- ignore
|
#Latency bound -- ignore
|
||||||
|
@@ -39,8 +39,9 @@ def linspace(a, b, n=100):
|
|||||||
|
|
||||||
def expspace(a,b,N,r=128):
|
def expspace(a,b,N,r=128):
|
||||||
return [int(ceil(exp(x)/r)*r) for x in linspace(log(a), log(b), N)]
|
return [int(ceil(exp(x)/r)*r) for x in linspace(log(a), log(b), N)]
|
||||||
|
|
||||||
def benchmark(template, tree):
|
|
||||||
|
def benchmark(template, tree, operation=sc.templates.gemm_nn):
|
||||||
queue = tree.context.queues[0]
|
queue = tree.context.queues[0]
|
||||||
queue.profiles[template, sc.float32] = sc.profile(template, sc.float32, queue)
|
queue.profiles[template, sc.float32] = sc.profile(template, sc.float32, queue)
|
||||||
times = []
|
times = []
|
||||||
@@ -49,14 +50,14 @@ def benchmark(template, tree):
|
|||||||
#Warm-up
|
#Warm-up
|
||||||
try:
|
try:
|
||||||
z, events = sc.driver.enqueue(tree)
|
z, events = sc.driver.enqueue(tree)
|
||||||
tree.context.queues[0].synchronize()
|
queue.synchronize()
|
||||||
except profile_execution_failure:
|
except profile_execution_failure:
|
||||||
return float("inf")
|
return float("inf")
|
||||||
#Time
|
#Time
|
||||||
while total < 1e-1:
|
while total < 1e-2:
|
||||||
start = time()
|
start = time()
|
||||||
z, events = sc.driver.enqueue(tree)
|
z, events = sc.driver.enqueue(tree)
|
||||||
tree.context.queues[0].synchronize()
|
queue.synchronize()
|
||||||
end = time()
|
end = time()
|
||||||
times.append(end - start)
|
times.append(end - start)
|
||||||
total += times[-1]
|
total += times[-1]
|
||||||
@@ -91,11 +92,12 @@ def tree_of(template, sizes, context):
|
|||||||
AT = template is sc.templates.gemm_tn or template is sc.templates.gemm_tt
|
AT = template is sc.templates.gemm_tn or template is sc.templates.gemm_tt
|
||||||
BT = template is sc.templates.gemm_nt or template is sc.templates.gemm_tt
|
BT = template is sc.templates.gemm_nt or template is sc.templates.gemm_tt
|
||||||
M, N, K = sizes
|
M, N, K = sizes
|
||||||
|
C = sc.empty((M,N), context=context)
|
||||||
A = sc.empty((K, M) if AT else (M, K), context=context)
|
A = sc.empty((K, M) if AT else (M, K), context=context)
|
||||||
B = sc.empty((N, K) if BT else (K, N), context=context)
|
B = sc.empty((N, K) if BT else (K, N), context=context)
|
||||||
AA = A.T if AT else A
|
AA = A.T if AT else A
|
||||||
BB = B.T if BT else B
|
BB = B.T if BT else B
|
||||||
return sc.dot(AA, BB), (A, B)
|
return sc.assign(C, sc.dot(AA, BB)), (A, B, C)
|
||||||
|
|
||||||
def memory_footprint(template, sizes):
|
def memory_footprint(template, sizes):
|
||||||
if issubclass(template, sc.templates.elementwise_1d):
|
if issubclass(template, sc.templates.elementwise_1d):
|
||||||
@@ -123,14 +125,16 @@ def metric_name_of(template):
|
|||||||
return 'GB/S'
|
return 'GB/S'
|
||||||
|
|
||||||
def external_profiles(template):
|
def external_profiles(template):
|
||||||
|
res = []
|
||||||
if template is sc.templates.gemm_nn:
|
if template is sc.templates.gemm_nn:
|
||||||
return [sc.templates.cublas_gemm('N', 'N')]
|
res += [sc.templates.cublas_gemm('N','N')]
|
||||||
elif template is sc.templates.gemm_tn:
|
elif template is sc.templates.gemm_tn:
|
||||||
return [sc.templates.cublas_gemm('T', 'N')]
|
res += [sc.templates.cublas_gemm('T','N')]
|
||||||
elif template is sc.templates.gemm_nt:
|
elif template is sc.templates.gemm_nt:
|
||||||
return [sc.templates.cublas_gemm('N', 'T')]
|
res += [sc.templates.cublas_gemm('N','T')]
|
||||||
elif template is sc.templates.gemm_tt:
|
elif template is sc.templates.gemm_tt:
|
||||||
return [sc.templates.cublas_gemm('T', 'T')]
|
res += [sc.templates.cublas_gemm('T','T')]
|
||||||
|
return res
|
||||||
|
|
||||||
def genetic_infos_of(template):
|
def genetic_infos_of(template):
|
||||||
if issubclass(template, sc.templates.elementwise_1d):
|
if issubclass(template, sc.templates.elementwise_1d):
|
||||||
@@ -144,4 +148,8 @@ def genetic_infos_of(template):
|
|||||||
elif issubclass(template, sc.templates.gemm):
|
elif issubclass(template, sc.templates.gemm):
|
||||||
return {'categorical': [8,9], 'nbits': [3,3,3,3,3,2,2,2,2,2,3,3]}
|
return {'categorical': [8,9], 'nbits': [3,3,3,3,3,2,2,2,2,2,3,3]}
|
||||||
|
|
||||||
|
def convert(profile):
|
||||||
|
if isinstance(profile, str):
|
||||||
|
return profile
|
||||||
|
else:
|
||||||
|
return map(int, profile)
|
||||||
|
@@ -77,11 +77,14 @@ class Tuner:
|
|||||||
#Square
|
#Square
|
||||||
for N in [896, 1760, 2048, 2560]:
|
for N in [896, 1760, 2048, 2560]:
|
||||||
sizes += [(N, N)]
|
sizes += [(N, N)]
|
||||||
#Tall and Skinny
|
#Short/Fat
|
||||||
for M in [16, 32, 64, 128]:
|
for M in [16, 32, 64, 128]:
|
||||||
for N in [1024, 4096, 16384, 65536, 262144]:
|
for N in [1024, 4096, 16384, 65536, 262144]:
|
||||||
sizes += [(M, N)]
|
sizes += [(M, N)]
|
||||||
sizes += [(N, M)]
|
#Tall/Skinny
|
||||||
|
for N in [16, 32, 64, 128]:
|
||||||
|
for M in [1024, 4096, 16384, 65536, 262144]:
|
||||||
|
sizes += [(M, N)]
|
||||||
|
|
||||||
#BLAS3 training sizes
|
#BLAS3 training sizes
|
||||||
if operation in [sc.templates.gemm_nn, sc.templates.gemm_nt, sc.templates.gemm_tn, sc.templates.gemm_tt]:
|
if operation in [sc.templates.gemm_nn, sc.templates.gemm_nt, sc.templates.gemm_tn, sc.templates.gemm_tt]:
|
||||||
@@ -113,8 +116,6 @@ class Tuner:
|
|||||||
try:
|
try:
|
||||||
with open(os.path.join(savepath, 'X.csv')) as f:
|
with open(os.path.join(savepath, 'X.csv')) as f:
|
||||||
X = [tuple(map(int, row)) for row in csv.reader(f, delimiter=',')]
|
X = [tuple(map(int, row)) for row in csv.reader(f, delimiter=',')]
|
||||||
with open(os.path.join(savepath, 'Y.csv')) as f:
|
|
||||||
Y = [map(float, row) for row in csv.reader(f, delimiter=',')]
|
|
||||||
with open(os.path.join(savepath, 'profiles.csv')) as f:
|
with open(os.path.join(savepath, 'profiles.csv')) as f:
|
||||||
def mmap(x):
|
def mmap(x):
|
||||||
if x=='FETCH_FROM_LOCAL':
|
if x=='FETCH_FROM_LOCAL':
|
||||||
@@ -125,6 +126,8 @@ class Tuner:
|
|||||||
return sc.templates.fetch_type.FETCH_FROM_GLOBAL_STRIDED
|
return sc.templates.fetch_type.FETCH_FROM_GLOBAL_STRIDED
|
||||||
return int(x)
|
return int(x)
|
||||||
profiles = [map(mmap,row) for v in row for row in csv.reader(f, delimiter=',')]
|
profiles = [map(mmap,row) for v in row for row in csv.reader(f, delimiter=',')]
|
||||||
|
with open(os.path.join(savepath, 'Y.csv')) as f:
|
||||||
|
Y = [map(float, row) for row in csv.reader(f, delimiter=',')]
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -140,36 +143,37 @@ class Tuner:
|
|||||||
self.progress_bar.update(1, 1, profiles[argmax(row)], max(row))
|
self.progress_bar.update(1, 1, profiles[argmax(row)], max(row))
|
||||||
continue
|
continue
|
||||||
tree, operands = tools.tree_of(operation, x, context)
|
tree, operands = tools.tree_of(operation, x, context)
|
||||||
#Check if GA needs to run (i.e., current best prediction is not a local optimum)
|
#Best predicted profile for x
|
||||||
tune = True
|
|
||||||
best = None
|
best = None
|
||||||
if idx > 0:
|
if idx > 0:
|
||||||
dim = min(10, idx+1)
|
if len(profiles) > 1:
|
||||||
clf = RandomForestRegressor(dim, dim).fit(X, Y)
|
clf = RandomForestRegressor(20, max_depth=5).fit(X, Y)
|
||||||
predictions = clf.predict(x)[0]
|
predictions = clf.predict(x)[0]
|
||||||
for idx in (-predictions).argsort():
|
for idx in (-predictions).argsort():
|
||||||
ts = tools.benchmark(operation(*profiles[idx]), tree)
|
best = profiles[idx]
|
||||||
if np.isfinite(ts):
|
ts = tools.benchmark(operation(*best), tree)
|
||||||
break
|
if ts != float('inf'):
|
||||||
if np.isfinite(ts):
|
break
|
||||||
best = profiles[idx]
|
else:
|
||||||
tune = not optimize.is_local_optimum(predicted, operation, x, context)
|
best = profiles[0]
|
||||||
|
|
||||||
#Retune if necessary
|
#Retune if necessary
|
||||||
|
tune = not (best and optimize.is_local_optimum(best, operation, x, context))
|
||||||
if tune:
|
if tune:
|
||||||
optimizer = optimize.GeneticOptimizer(self.logger, naccept=1000, niter=1000, cxpb=.4, mutpb=.4, popsize=20, progress_bar = self.progress_bar)
|
optimizer = optimize.GeneticOptimizer(self.logger, naccept=1000, niter=1000, cxpb=.4, mutpb=.4, popsize=20, progress_bar = self.progress_bar)
|
||||||
best = optimizer.run(operation, x, context, prior=best)[0]
|
best = optimizer.run(operation, x, context, prior=best)[0]
|
||||||
if best not in profiles:
|
if best not in profiles:
|
||||||
profiles.append(best)
|
profiles.append(best)
|
||||||
for xx,yy in zip(X, Y):
|
for xx,yy in zip(X, Y):
|
||||||
tree, _operands = tools.tree_of(operation, xx, context)
|
tree, _ = tools.tree_of(operation, xx, context)
|
||||||
time = tools.benchmark(operation(*best), _tree)
|
time = tools.benchmark(operation(*best), tree)
|
||||||
yy.append(performance(xx, time))
|
yy.append(performance(xx, time))
|
||||||
#Update dataset
|
#Update dataset
|
||||||
X.append(x)
|
X.append(x)
|
||||||
y = [performance(x,tools.benchmark(operation(*prf), tree)) for prf in profiles]
|
y = [performance(x,tools.benchmark(operation(*prf), tree)) for prf in profiles]
|
||||||
Y.append(y)
|
Y.append(y)
|
||||||
#Save data
|
#Save data
|
||||||
for (fname, data) in zip(['X.csv', 'Y.csv', 'profiles.csv'], [X, Y, profiles]):
|
for (fname, data) in zip(['X.csv', 'Y.csv', 'profiles.csv'], [X, Y, profiles]):
|
||||||
with open(os.path.join(savepath, fname), 'wb') as f:
|
with open(os.path.join(savepath, fname), 'wb') as f:
|
||||||
csv.writer(f).writerows(data)
|
csv.writer(f).writerows(data)
|
||||||
#print performance info in case no tuning was done
|
#print performance info in case no tuning was done
|
||||||
@@ -182,11 +186,12 @@ class Tuner:
|
|||||||
for prof in tools.external_profiles(operation):
|
for prof in tools.external_profiles(operation):
|
||||||
for x, y in zip(X, Y):
|
for x, y in zip(X, Y):
|
||||||
tree, operands = tools.tree_of(operation, x, context)
|
tree, operands = tools.tree_of(operation, x, context)
|
||||||
perf = performance(x,tools.benchmark(prof, tree))
|
perf = performance(x,tools.benchmark(prof, tree, operation))
|
||||||
if perf > 0:
|
if perf > 0:
|
||||||
profiles.append(prof.__class__.__name__)
|
profiles.append(prof.__class__.__name__)
|
||||||
y.append(perf)
|
y.append(perf)
|
||||||
|
|
||||||
|
|
||||||
#Pruning of useless profiles
|
#Pruning of useless profiles
|
||||||
if len(Y[0]) > 1:
|
if len(Y[0]) > 1:
|
||||||
unused = np.where(np.bincount(np.argmax(Y, 1))==0)[0]
|
unused = np.where(np.bincount(np.argmax(Y, 1))==0)[0]
|
||||||
@@ -212,5 +217,5 @@ class Tuner:
|
|||||||
'threshold': e.tree_.threshold.astype('float64').tolist(),
|
'threshold': e.tree_.threshold.astype('float64').tolist(),
|
||||||
'feature': e.tree_.feature.astype('float64').tolist(),
|
'feature': e.tree_.feature.astype('float64').tolist(),
|
||||||
'value': e.tree_.value[:,:,0].astype('float64').tolist()} for e in clf.estimators_]
|
'value': e.tree_.value[:,:,0].astype('float64').tolist()} for e in clf.estimators_]
|
||||||
D['profiles'] = [map(int, x) for x in profiles]
|
D['profiles'] = [tools.convert(x) for x in profiles]
|
||||||
json.dump(json_data, open(json_path,'w'))
|
json.dump(json_data, open(json_path,'w'))
|
||||||
|
Reference in New Issue
Block a user