CUDA: various improvements

Author: Philippe Tillet
Date:   2015-08-21 13:06:20 -04:00
commit 10524ebdee
parent 33dac6b05a
25 changed files with 170 additions and 130 deletions

View File

@@ -39,12 +39,14 @@ string(REPLACE ";" " " BLAS_DEF_STR "${BLAS_DEF}")
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 foreach(PROG blas)
     if(CUDA_FOUND)
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} " ${BLAS_DEF_STR} -std=c++11 ${BACKEND_DEFINES}")
+        set(OLD_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+        set(CMAKE_CXX_FLAGS "")
         set(CUPROG ${CMAKE_CURRENT_BINARY_DIR}/${PROG}.cu)
         file(COPY ${PROG}.cpp DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
         file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${PROG}.cpp ${CUPROG})
-        cuda_add_executable(${PROG}-bench ${CUPROG})
+        cuda_add_executable(${PROG}-bench ${CUPROG} OPTIONS "-std=c++11 ${BLAS_DEF_STR} ${BACKEND_DEFINES}")
         cuda_add_cublas_to_target(${PROG}-bench)
+        set(CMAKE_CXX_FLAGS "${OLD_CXX_FLAGS}")
     else()
         add_executable(${PROG}-bench ${PROG}.cpp)
         set_target_properties(${PROG}-bench PROPERTIES COMPILE_FLAGS "${BLAS_DEF_STR}")

View File

@@ -94,7 +94,7 @@ void bench(sc::numeric_type dtype, std::string operation)
     cudaEventCreate(&stop);\
     OP;\
     cudaThreadSynchronize();\
-    while(total_time*1e-3 < 1e-3){\
+    while(total_time*1e-3 < 1e-2){\
       flush = sc::zeros(1e6, 1, dtype);\
       cudaEventRecord(start,0);\
       OP;\
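The threshold change above raises the measurement budget tenfold: the macro keeps re-launching OP, with a cache-flushing allocation between runs, until about 10 ms of event-timed execution has accumulated. A minimal standalone sketch of the same pattern, assuming the CUDA runtime API, times kept in milliseconds, and cudaDeviceSynchronize in place of the deprecated cudaThreadSynchronize (the helper name is hypothetical):

    #include <cuda_runtime.h>
    #include <cfloat>

    template<class Op>
    float benchmark_ms(Op op)
    {
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        op();                                       // warm-up launch
        cudaDeviceSynchronize();
        float total_ms = 0, best_ms = FLT_MAX;
        while(total_ms*1e-3f < 1e-2f)               // accumulate ~10 ms of GPU time
        {
            cudaEventRecord(start, 0);
            op();
            cudaEventRecord(stop, 0);
            cudaEventSynchronize(stop);
            float ms;
            cudaEventElapsedTime(&ms, start, stop); // reported in milliseconds
            total_ms += ms;
            best_ms = ms < best_ms ? ms : best_ms;
        }
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        return best_ms;
    }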
@@ -191,7 +191,6 @@ void bench(sc::numeric_type dtype, std::string operation)
 #endif
 #ifdef BENCH_CUBLAS
     T *cux, *cuy;
-    T result;
     cudaMalloc((void**) &cux, N * sizeof(T));
     cudaMalloc((void**) &cuy, N * sizeof(T));
     BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1), 2*N*dtsize/t)
@@ -210,6 +209,7 @@ void bench(sc::numeric_type dtype, std::string operation)
     //AlexNet
     MNs.push_back(std::make_tuple('N',1000,256));
     MNs.push_back(std::make_tuple('N',4096,256));
     MNs.push_back(std::make_tuple('T',169,256));
     MNs.push_back(std::make_tuple('T',169,384));
     MNs.push_back(std::make_tuple('T',729,256));
@@ -261,39 +261,43 @@ void bench(sc::numeric_type dtype, std::string operation)
   if(operation.substr(0,4)=="gemm")
   {
     std::vector<std::tuple<std::string, char, char, int_t, int_t, int_t> > MNKs;
-    MNKs.push_back(std::make_tuple("Square [512]",'N','T',512,512,512));
-    MNKs.push_back(std::make_tuple("Square [1536]",'N','T',1536,1536,1536));
-    //AlexNet (Forward)
-    MNKs.push_back(std::make_tuple("F-Conv1",'N','N',3025,96,363));
-    MNKs.push_back(std::make_tuple("F-Conv2",'N','N',729,128,1200));
-    MNKs.push_back(std::make_tuple("F-Conv3",'N','N',169,384,2304));
-    MNKs.push_back(std::make_tuple("F-Conv4",'N','N',169,192,1728));
-    MNKs.push_back(std::make_tuple("F-Conv5",'N','N',169,128,1728));
-    //LeNet (Forward)
-    MNKs.push_back(std::make_tuple("F-Conv1",'N','N',576,20,25));
-    MNKs.push_back(std::make_tuple("F-Conv2",'N','N',64,50,500));
-    //AlexNet (Backward)
-    MNKs.push_back(std::make_tuple("B-Conv5",'T','N',1728,128,169));
-    MNKs.push_back(std::make_tuple("B-Conv4",'T','N',1728,192,169));
-    MNKs.push_back(std::make_tuple("B-Conv3",'T','N',2304,384,169));
-    MNKs.push_back(std::make_tuple("B-Conv2",'T','N',1200,128,729));
-    MNKs.push_back(std::make_tuple("B-Conv1",'T','N',363,96,3025));
-    //LeNet (Backward)
-    MNKs.push_back(std::make_tuple("B-Conv2",'T','N',500,50,64));
-    MNKs.push_back(std::make_tuple("B-Conv1",'T','N',25,20,576));
-    MNKs.push_back(std::make_tuple("B-Conv5 [bottom]",'N','T',169,1728,128));
-    MNKs.push_back(std::make_tuple("B-Conv4 [bottom]",'N','T',169,1728,192));
-    MNKs.push_back(std::make_tuple("B-Conv3 [bottom]",'N','T',169,2304,384));
-    MNKs.push_back(std::make_tuple("B-Conv2 [bottom]",'N','T',729,1200,128));
-    //LeNet (Backward)
-    MNKs.push_back(std::make_tuple("B-Conv2 [bottom]",'N','T',64,500,50));
-    //Covariance (e.g., ICA)
-    MNKs.push_back(std::make_tuple("ICA [32 channels]",'N','N',32,32,32000));
-    MNKs.push_back(std::make_tuple("ICA [256 channels]",'N','N',256,256,32000));
+    //Square
+    MNKs.push_back(std::make_tuple("Square [N=896]",'N','T',896,896,896));
+    MNKs.push_back(std::make_tuple("Square [N=2560]",'N','T',2560,2560,2560));
+    //Convolution
+    MNKs.push_back(std::make_tuple("Convolution [AlexNet-1]",'N','N',3025,96,363));
+    MNKs.push_back(std::make_tuple("Convolution [AlexNet-2]",'N','N',729,128,1200));
+    MNKs.push_back(std::make_tuple("Convolution [AlexNet-3]",'N','N',169,384,2304));
+    MNKs.push_back(std::make_tuple("Convolution [AlexNet-4]",'N','N',169,192,1728));
+    MNKs.push_back(std::make_tuple("Convolution [AlexNet-5]",'N','N',169,128,1728));
+//    MNKs.push_back(std::make_tuple("Convolution [LeNet-1]",'N','N',576,20,25));
+//    MNKs.push_back(std::make_tuple("Convolution [LeNet-2]",'N','N',64,50,500));
+    //Convolution Gradient-1
+    MNKs.push_back(std::make_tuple("Convolution Gradient-1 [AlexNet-5]",'T','N',1728,128,169));
+    MNKs.push_back(std::make_tuple("Convolution Gradient-1 [AlexNet-4]",'T','N',1728,192,169));
+    MNKs.push_back(std::make_tuple("Convolution Gradient-1 [AlexNet-3]",'T','N',2304,384,169));
+    MNKs.push_back(std::make_tuple("Convolution Gradient-1 [AlexNet-2]",'T','N',1200,128,729));
+    MNKs.push_back(std::make_tuple("Convolution Gradient-1 [AlexNet-1]",'T','N',363,96,3025));
+//    MNKs.push_back(std::make_tuple("Conv. Gradient-1 [LeNet-2]",'T','N',500,50,64));
+//    MNKs.push_back(std::make_tuple("Conv. Gradient-1 [LeNet-1]",'T','N',25,20,576));
+    //Convolution Gradient-2
+    MNKs.push_back(std::make_tuple("Convolution Gradient-2 [AlexNet-5]",'N','T',169,1728,128));
+    MNKs.push_back(std::make_tuple("Convolution Gradient-2 [AlexNet-4]",'N','T',169,1728,192));
+    MNKs.push_back(std::make_tuple("Convolution Gradient-2 [AlexNet-3]",'N','T',169,2304,384));
+    MNKs.push_back(std::make_tuple("Convolution Gradient-2 [AlexNet-2]",'N','T',729,1200,128));
+//    MNKs.push_back(std::make_tuple("Conv. Gradient-2 [LeNet-2]",'N','T',64,500,50));
+    //Covariance (e.g., ICA, 10minutes/1khz)
+    MNKs.push_back(std::make_tuple("ICA [32 channels]",'N','T',32,32,600000));
+    MNKs.push_back(std::make_tuple("ICA [256 channels]",'N','T',256,256,600000));
+    //Bi-diagonalization
+    MNKs.push_back(std::make_tuple("Bidiagonalization [Iteration 1]",'N','T',4096,4096,32));
+    MNKs.push_back(std::make_tuple("Bidiagonalization [Iteration 10]",'N','T',3456,3456,32));
+    MNKs.push_back(std::make_tuple("Bidiagonalization [Iteration 50]",'N','T',896,896,32));

     /*---------*/
     /*--BLAS3--*/
@@ -317,7 +321,7 @@ void bench(sc::numeric_type dtype, std::string operation)
 #ifdef HAS_A_BLAS
     int_t lda = A.ld(), ldb = B.ld(), ldc = C.ld();
 #endif
-    BENCHMARK_ISAAC(C = sc::control(AT?(BT?dot(A.T(),B.T()):dot(A.T(),B)):(BT?dot(A,B.T()):dot(A,B)), sc::execution_options_type(0, &events)), (double)2*M*N*K/t);
+    BENCHMARK_ISAAC(C = sc::control(AT?(BT?dot(A.T(),B.T()):dot(A.T(),B)):(BT?dot(A,B.T()):dot(A,B)), sc::execution_options_type(0, &events), sc::dispatcher_options_type(false)), (double)2*M*N*K/t);
     /* clblas */
 #ifdef BENCH_CLBLAS
     BENCHMARK_CLBLAS(clblasSgemm(clblasColumnMajor, AT?clblasTrans:clblasNoTrans, BT?clblasTrans:clblasNoTrans, M, N, K, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(B.data()), 0, ldb,
@@ -336,7 +340,7 @@ void bench(sc::numeric_type dtype, std::string operation)
     cudaMalloc((void**) &cuA, M * K * sizeof(T));
     cudaMalloc((void**) &cuB, K * N * sizeof(T));
     cudaMalloc((void**) &cuC, M * N * sizeof(T));
-    BENCHMARK_CUDA(cublasSgemm('n', 't', M, N, K, 1, cuA, lda, cuB, ldb, 1, cuC, ldc), (double)2*M*N*K/t)
+    BENCHMARK_CUDA(cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, cuA, lda, cuB, ldb, 1, cuC, ldc), (double)2*M*N*K/t)
     cudaFree(cuA);
     cudaFree(cuB);
     cudaFree(cuC);
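The cublasSgemm call above is the legacy cuBLAS v1 interface, which takes its transpose options as plain 'n'/'t' characters; the fix derives them from the benchmark's AT/BT flags instead of hard-coding 'n' and 't', so transposed layouts are actually measured as such. The corrected call shape, sketched with the names used in the hunk:

    // Legacy cuBLAS API: transpose flags are chars, not cublasOperation_t.
    char transa = AT ? 't' : 'n';
    char transb = BT ? 't' : 'n';
    cublasSgemm(transa, transb, M, N, K, 1.0f, cuA, lda, cuB, ldb, 1.0f, cuC, ldc);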

View File

@@ -143,7 +143,7 @@ def main():
                     libraries=libraries)]
     #External
-    extensions += [Extension('autotuning.external.sklearn._tree',
+    extensions += [Extension('external.sklearn._tree',
                     ['external/sklearn/_tree.c'],
                     include_dirs = [numpy_include])]
@@ -155,7 +155,7 @@ def main():
           author='Philippe Tillet',
           author_email='ptillet@g.harvard.edu',
           license='MPL 2.0',
-          packages=['isaac','isaac.autotuning', 'isaac.autotuning.external', 'isaac.autotuning.external.deap', 'isaac.autotuning.external.deap.tools', 'isaac.autotuning.external.sklearn'],
+          packages=['isaac','isaac.external', 'isaac.external.sklearn'],
           ext_package="isaac",
           ext_modules=extensions,
           cmdclass={'build_py': build_py, 'build_ext': build_ext_subclass},

View File

@@ -18,10 +18,10 @@
 #define RESTORE_MSVC_WARNING_C4275 __pragma(warning(disable: 4275))
 #else
 #define DISABLE_MSVC_WARNING_C4251
 #define RESTORE_MSVC_WARNING_C4251
 #define DISABLE_MSVC_WARNING_C4275
 #define RESTORE_MSVC_WARNING_C4275
 #endif
 #endif

View File

@@ -19,7 +19,7 @@ namespace isaac
 namespace driver
 {
-enum ISAACAPI backend_type
+enum backend_type
 {
   OPENCL
 #ifdef ISAAC_WITH_CUDA

View File

@@ -20,7 +20,7 @@ private:
   friend class CommandQueue;
 public:
-  enum ISAACAPI Type
+  enum Type
   {
     GPU = CL_DEVICE_TYPE_GPU,
     CPU = CL_DEVICE_TYPE_CPU,

View File

@@ -15,10 +15,6 @@ namespace driver
 class ISAACAPI Event
 {
   friend class CommandQueue;
-private:
-#ifdef ISAAC_WITH_CUDA
-  typedef std::pair<CUevent, CUevent> cu_event_t;
-#endif
 public:
   Event(cl_event const & event, bool take_ownership = true);
   Event(backend_type backend);

View File

@@ -12,6 +12,15 @@ namespace isaac
 namespace driver
 {

+#ifdef ISAAC_WITH_CUDA
+struct cu_event_t{
+    operator bool() const { return first && second; }
+    CUevent first;
+    CUevent second;
+};
+#endif
+
 #ifdef ISAAC_WITH_CUDA
 #define HANDLE_TYPE(CLTYPE, CUTYPE) Handle<CLTYPE, CUTYPE>
 #else
@@ -30,7 +39,7 @@ private:
   static void _delete(CUevent x);
   static void _delete(CUfunction);
   static void _delete(CUmodule x);
-  static void _delete(std::pair<CUevent, CUevent> x);
+  static void _delete(cu_event_t x);
 #endif
   static void release(cl_context x);
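cu_event_t, defined in the first hunk of this file, replaces std::pair<CUevent, CUevent>: a timed CUDA "event" in this driver is really a start/end pair of CUevents, and the added operator bool lets owners skip deleting default-constructed, null pairs. A sketch of how such a pair yields a duration, assuming the CUDA driver API and a nanosecond result (the helper name is hypothetical):

    #include <cuda.h>

    long elapsed_ns(cu_event_t const & e)
    {
      float ms = 0;
      cuEventSynchronize(e.second);                // wait until the end event completes
      cuEventElapsedTime(&ms, e.first, e.second);  // CUDA reports milliseconds
      return static_cast<long>(ms*1e6f);           // convert to nanoseconds
    }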

View File

@@ -21,7 +21,7 @@ public:
   virtual ~symbolic_binder();
   virtual bool bind(driver::Buffer const &) = 0;
   virtual unsigned int get(driver::Buffer const &) = 0;
-  virtual unsigned int get();
+  unsigned int get();
 protected:
   unsigned int current_arg_;
   std::map<driver::Buffer,unsigned int> memory;

View File

@@ -70,6 +70,7 @@ struct CastPrefix: public keyword{ CastPrefix(driver::backend_type backend, std:
 struct InitPrefix: public keyword{ InitPrefix(driver::backend_type backend, std::string const & datatype): keyword(backend, "", "make_" + datatype){} };
 struct Infinity: public keyword{ Infinity(driver::backend_type backend, std::string const & datatype): keyword(backend, "INFINITY", "infinity<" + datatype + ">()"){} };
+struct Select: public keyword{ Select(driver::backend_type backend, std::string cond, std::string if_value, std::string else_value): keyword(backend, "select(" + else_value + "," + if_value + "," + cond + ")", "(" + cond + ")?" + if_value + ":" + else_value) {} };

 #undef ADD_KEYWORD
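The new Select keyword abstracts the two backends' conditional idioms: OpenCL's select(a, b, c) returns b where c is true, hence the reversed argument order in the OpenCL string, while the CUDA string is an ordinary ternary. A sketch of the intended use, assuming keyword's stream operator emits whichever of its two strings matches the active backend:

    // Hypothetical generator snippet:
    stream << "x = " << Select(backend, "i < M", "A[i]", "0") << ";";
    // OpenCL backend emits:  x = select(0,A[i],i < M);
    // CUDA backend emits:    x = (i < M)?A[i]:0;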

View File

@@ -1,9 +1,9 @@
 #ifndef ISAAC_VALUE_SCALAR_H
 #define ISAAC_VALUE_SCALAR_H

-#include <inttypes.h>
 #include "isaac/defines.h"
 #include "isaac/types.h"
+#include <stdint.h>

 namespace isaac
 {

View File

@@ -130,12 +130,12 @@ void backend::platforms(std::vector<Platform> & platforms)
 #ifdef ISAAC_WITH_CUDA
   platforms.push_back(Platform(CUDA));
 #endif
-  cl_uint nplatforms;
-  ocl::check(clGetPlatformIDs(0, NULL, &nplatforms));
-  std::vector<cl_platform_id> clplatforms(nplatforms);
-  ocl::check(clGetPlatformIDs(nplatforms, clplatforms.data(), NULL));
-  for(cl_platform_id p: clplatforms)
-    platforms.push_back(Platform(p));
+//  cl_uint nplatforms;
+//  ocl::check(clGetPlatformIDs(0, NULL, &nplatforms));
+//  std::vector<cl_platform_id> clplatforms(nplatforms);
+//  ocl::check(clGetPlatformIDs(nplatforms, clplatforms.data(), NULL));
+//  for(cl_platform_id p: clplatforms)
+//    platforms.push_back(Platform(p));
 }

 void backend::synchronize(Context const & context)

View File

@@ -44,7 +44,7 @@ long Event::elapsed_time() const
   }
 }

-HANDLE_TYPE(cl_event, Event::cu_event_t) & Event::handle()
+HANDLE_TYPE(cl_event, cu_event_t) & Event::handle()
 { return h_; }

 }

View File

@@ -19,7 +19,7 @@ template<class CLType, class CUType>
 void Handle<CLType, CUType>::_delete(CUstream x) { cuStreamDestroy(x); }

 template<class CLType, class CUType>
-void Handle<CLType, CUType>::_delete(CUdevice) { }
+void Handle<CLType, CUType>::_delete(CUdevice) { std::cout << "CUdevice" << std::endl;}

 template<class CLType, class CUType>
 void Handle<CLType, CUType>::_delete(CUevent x) { cuEventDestroy(x); }
@@ -31,7 +31,7 @@ template<class CLType, class CUType>
 void Handle<CLType, CUType>::_delete(CUmodule x) { cuModuleUnload(x); }

 template<class CLType, class CUType>
-void Handle<CLType, CUType>::_delete(std::pair<CUevent, CUevent> x) { _delete(x.first); _delete(x.second); }
+void Handle<CLType, CUType>::_delete(cu_event_t x) { _delete(x.first); _delete(x.second); }
 #endif
@@ -100,8 +100,9 @@ template<class CLType, class CUType>
 Handle<CLType, CUType>::~Handle()
 {
 #ifdef ISAAC_WITH_CUDA
-  if(has_ownership_ && cu_.unique())
+  if(has_ownership_ && cu_ && cu_.unique() && *cu_){
     _delete(*cu_);
+  }
 #endif
   if(has_ownership_ && cl_ && cl_.unique() && *cl_)
     release(*cl_);
@@ -132,7 +133,7 @@ template class Handle<cl_mem, CUdeviceptr>;
 template class Handle<cl_command_queue, CUstream>;
 template class Handle<cl_context, CUcontext>;
 template class Handle<cl_device_id, CUdevice>;
-template class Handle<cl_event, std::pair<CUevent, CUevent> >;
+template class Handle<cl_event, cu_event_t>;
 template class Handle<cl_kernel, CUfunction>;
 template class Handle<cl_program, CUmodule>;
 #else

View File

@@ -10,6 +10,8 @@ namespace driver
 NDRange::NDRange(size_t size0)
 {
   sizes_[0] = size0;
+  sizes_[1] = 1;
+  sizes_[2] = 1;
   dimension_ = 1;
 }
@@ -17,6 +19,7 @@ NDRange::NDRange(size_t size0, size_t size1)
 {
   sizes_[0] = size0;
   sizes_[1] = size1;
+  sizes_[2] = 1;
   dimension_ = 2;
 }
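Both constructors now default the unused trailing sizes to 1. The likely motivation: a launch consumes all three entries regardless of dimension_, so an uninitialized sizes_[1] or sizes_[2] could become a garbage grid dimension on the CUDA path. A sketch of such a consumer, using the real cuLaunchKernel signature but a hypothetical wrapper:

    #include <cuda.h>

    // All three grid entries are read, so trailing sizes must be 1, not garbage.
    void launch(CUfunction f, size_t const sizes[3], size_t const local[3], CUstream stream)
    {
      cuLaunchKernel(f,
                     sizes[0], sizes[1], sizes[2],  // grid dimensions
                     local[0], local[1], local[2],  // block dimensions
                     0, stream, NULL, NULL);        // shared memory, params, extras
    }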

View File

@@ -12,7 +12,11 @@ namespace driver
 {

 #ifdef ISAAC_WITH_CUDA
-Platform::Platform(backend_type backend): backend_(backend){}
+Platform::Platform(backend_type backend): backend_(backend)
+{
+  if(backend==CUDA)
+    cuInit(0);
+}
 #endif

 Platform::Platform(cl_platform_id const & platform) : backend_(OPENCL)
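Constructing the CUDA Platform now calls cuInit(0), which the driver API requires before any other cu* function; calls made earlier fail with CUDA_ERROR_NOT_INITIALIZED, and repeated initialization is harmless. A minimal sketch of the ordering constraint (the helper name is hypothetical):

    #include <cuda.h>

    int cuda_device_count()
    {
      cuInit(0);             // must precede every other driver-API call; idempotent
      int n = 0;
      cuDeviceGetCount(&n);
      return n;
    }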

View File

@@ -56,8 +56,11 @@ gemm_parameters::gemm_parameters(unsigned int simd_width
   return 0;
 }

-int gemm::is_invalid_impl(driver::Device const &, expressions_tuple const &) const
+int gemm::is_invalid_impl(driver::Device const & device, expressions_tuple const &) const
 {
+  if(device.vendor()==driver::Device::Vendor::NVIDIA && p_.simd_width > 1)
+    return TEMPLATE_INVALID_SIMD_WIDTH;
   if(p_.A_fetching_policy!=FETCH_FROM_LOCAL || p_.B_fetching_policy!=FETCH_FROM_LOCAL)
     return TEMPLATE_INVALID_FETCHING_POLICY_TYPE;
@@ -170,8 +173,8 @@ gemm_parameters::gemm_parameters(unsigned int simd_width
     stream << Local(backend) << " " << sdtype << " lB[" << p_.kL*p_.nL << "];" << std::endl;
     unsigned int npA = p_.mL/(A_trans_=='N'?p_.local_fetch_0*p_.simd_width:p_.local_fetch_1);
     unsigned int npB = p_.nL/(B_trans_=='T'?p_.local_fetch_0*p_.simd_width:p_.local_fetch_1);
-    stream << "__global " << sdtype << "* Ai[" << npA << "];" << std::endl;
-    stream << "__global " << sdtype << "* Bi[" << npB << "];" << std::endl;
+    stream << Global(backend) << " " << sdtype << "* Ai[" << npA << "];" << std::endl;
+    stream << Global(backend) << " " << sdtype << "* Bi[" << npB << "];" << std::endl;
     stream << std::endl;
     stream << "//identifiers" << std::endl;
@@ -179,7 +182,11 @@ gemm_parameters::gemm_parameters(unsigned int simd_width
     stream << "int idt;" << std::endl;
     if(has_depth)
       stream << "int gidz, div, offz;" << std::endl;
-    stream << "int4 ids = (int4)(" << GroupIdx0(backend) << "," << GroupIdx1(backend) << "," << LocalIdx0(backend) << "," << LocalIdx1(backend) << ");" << std::endl;
+    stream << "uint4 ids;" << std::endl;
+    stream << "ids.x = " << GroupIdx0(backend) << ";" << std::endl;
+    stream << "ids.y = " << GroupIdx1(backend) << ";" << std::endl;
+    stream << "ids.z = " << LocalIdx0(backend) << ";" << std::endl;
+    stream << "ids.w = " << LocalIdx1(backend) << ";" << std::endl;
     stream << std::endl;
     stream << "//offsets" << std::endl;
@@ -266,15 +273,15 @@ gemm_parameters::gemm_parameters(unsigned int simd_width
     for(unsigned int i = 0 ; i < npA ; i++ )
       if (A_trans_=='N')
-        stream << "Ai[" << i << "] += select(0, (int)((idT.x + " << i*p_.local_fetch_0*p_.simd_width << ")" << ASTRIDE1 << "), " << i*p_.local_fetch_0*p_.simd_width << " < M);" << std::endl;
+        stream << "Ai[" << i << "] += " << Select(backend, to_string(i*p_.local_fetch_0*p_.simd_width) + " < M", "(int)((idT.x + " + to_string(i*p_.local_fetch_0*p_.simd_width) + ")" + ASTRIDE1 + ")", "0") << ";" << std::endl;
       else
-        stream << "Ai[" << i << "] += select(0, (int)((idT.y + " << i*p_.local_fetch_1 << ")*lda), " << i*p_.local_fetch_1 << " < M);" << std::endl;
+        stream << "Ai[" << i << "] += " << Select(backend, to_string(i*p_.local_fetch_1) + " < M", "(int)((idT.y + " + to_string(i*p_.local_fetch_1) + ")*lda)", "0") << ";" << std::endl;
     for(unsigned int i = 0 ; i < npB ; i++ )
       if (B_trans_=='T')
-        stream << "Bi[" << i << "] += select(0, (int)((idT.x + " << i*p_.local_fetch_0*p_.simd_width << ")" << BSTRIDE1 << "), " << i*p_.local_fetch_0*p_.simd_width << " < N);" << std::endl;
+        stream << "Bi[" << i << "] += " << Select(backend, to_string(i*p_.local_fetch_0*p_.simd_width) + " < N", "(int)((idT.x + " + to_string(i*p_.local_fetch_0*p_.simd_width) + ")" + BSTRIDE1 + ")", "0") << ";" << std::endl;
       else
-        stream << "Bi[" << i << "] += select(0, (int)((idT.y + " << i*p_.local_fetch_1 << ")*ldb), " << i*p_.local_fetch_1 << " < N);" << std::endl;
+        stream << "Bi[" << i << "] += " << Select(backend, to_string(i*p_.local_fetch_1) + " < N", "(int)((idT.y + " + to_string(i*p_.local_fetch_1) + ")*ldb)", "0") << ";" << std::endl;
     stream << std::endl;
     stream << "//Outer loop" << std::endl;
@@ -504,14 +511,6 @@ gemm_parameters::gemm_parameters(unsigned int simd_width
     stream << "N -= ids.y;" << std::endl;
     stream << "N -= ids.w*" << p_.simd_width << ";" << std::endl;
-    stream << "int ibm[" << p_.mS << "];" << std::endl;
-    for(unsigned int m=0; m < p_.mS; ++m)
-    {
-      string Ci = to_string((m/p_.simd_width)*(p_.local_size_0*p_.simd_width) + m%p_.simd_width);
-      stream << "ibm[" << m << "] = " << Ci << " < M;" << std::endl;
-    }
     for(unsigned int n=0; n < p_.nS; ++n)
     {
       string Cj = to_string((n/p_.simd_width)*(p_.local_size_1*p_.simd_width) + n%p_.simd_width);
@@ -521,13 +520,15 @@ gemm_parameters::gemm_parameters(unsigned int simd_width
       for(unsigned int m=0; m < p_.mS; ++m)
       {
         string Ci = to_string((m/p_.simd_width)*(p_.local_size_0*p_.simd_width) + m%p_.simd_width);
-        stream << "if(ibm[" << m << "]) ";
-        stream << "C[" << Ci << CSTRIDE1 << "] = rC[" << m << "][" << n << "] + select((" << sdtype << ")0, C[" << Ci << CSTRIDE1 << "], beta>0);" << std::endl;
+        stream << "if(" << Ci << "< M) ";
+        stream << "C[" << Ci << CSTRIDE1 << "] = rC[" << m << "][" << n << "];" << std::endl;
       }
-      if((n+1)%p_.simd_width==0)
+      if((n+1)%p_.simd_width==0){
         stream << "C += ldc*" << p_.local_size_1*p_.simd_width - p_.simd_width + 1 << ";" << std::endl;
-      else
+      }
+      else{
         stream << "C += ldc;" << std::endl;
+      }
     }

View File

@@ -119,15 +119,16 @@ void profiles::value_type::execute(controller<expressions_tuple> const & expr)
   else if(predictor_.get())
   {
     std::vector<float> predictions = predictor_->predict(x);
-    do{
+//    do{
       label = std::distance(predictions.begin(),std::max_element(predictions.begin(), predictions.end()));
-      predictions[label] = 0;
-    }while(templates_[label]->temporary_workspace(expr.x()) > MAX_TEMPORARY_WORKSPACE);
+//      predictions[label] = 0;
+//    }while(templates_[label]->temporary_workspace(expr.x()) > MAX_TEMPORARY_WORKSPACE);
   }
   //Execution
-  if(templates_[label]->temporary_workspace(expr.x()) > MAX_TEMPORARY_WORKSPACE)
-    throw operation_not_supported_exception("Running this operation would require an overly large temporary.");
+//  std::cout << label << std::endl;
+//  if(templates_[label]->temporary_workspace(expr.x()) > MAX_TEMPORARY_WORKSPACE)
+//    throw operation_not_supported_exception("Running this operation would require an overly large temporary.");
   return templates_[label]->enqueue(queue_, program, tools::to_string(label), *fallback_, expr);
 }

View File

@@ -143,7 +143,7 @@ def main():
                     libraries=libraries)]
     #External
-    extensions += [Extension('autotuning.external.sklearn._tree',
+    extensions += [Extension('external.sklearn._tree',
                     ['external/sklearn/_tree.c'],
                     include_dirs = [numpy_include])]
@@ -155,7 +155,7 @@ def main():
           author='Philippe Tillet',
           author_email='ptillet@g.harvard.edu',
           license='MPL 2.0',
-          packages=['isaac','isaac.autotuning', 'isaac.autotuning.external', 'isaac.autotuning.external.deap', 'isaac.autotuning.external.deap.tools', 'isaac.autotuning.external.sklearn'],
+          packages=['isaac','isaac.external', 'isaac.external.sklearn'],
           ext_package="isaac",
           ext_modules=extensions,
           cmdclass={'build_py': build_py, 'build_ext': build_ext_subclass},

View File

@@ -81,27 +81,32 @@ private:
 void export_exceptions()
 {
-  wrap::exception<isaac::operation_not_supported_exception>("OperationNotSupported", bp::init<std::string>())
-      .def("__str__", &isaac::operation_not_supported_exception::what)
-  ;
-  wrap::exception<isaac::driver::ocl::exception::out_of_resources>("LaunchOutOfResources")
-      .def("__str__", &isaac::driver::ocl::exception::out_of_resources::what)
-  ;
-  wrap::exception<isaac::driver::ocl::exception::mem_object_allocation_failure>("MemObjectAllocationFailure")
-      .def("__str__", &isaac::driver::ocl::exception::mem_object_allocation_failure::what)
-  ;
-  wrap::exception<isaac::driver::ocl::exception::out_of_host_memory>("OutOfHostMemory")
-      .def("__str__", &isaac::driver::ocl::exception::out_of_host_memory::what)
-  ;
-  wrap::exception<isaac::driver::ocl::exception::invalid_work_group_size>("InvalidWorkGroupSize")
-      .def("__str__", &isaac::driver::ocl::exception::invalid_work_group_size::what)
-  ;
-  wrap::exception<isaac::driver::ocl::exception::invalid_value>("InvalidValue")
-      .def("__str__", &isaac::driver::ocl::exception::invalid_value::what)
-  ;
+#define BIND_EXCEPTION(CPPNAME, PYTHONNAME) \
+  wrap::exception<isaac::CPPNAME>(PYTHONNAME, bp::init<std::string>())\
+  .def("__str__", &isaac::CPPNAME::what)
+  BIND_EXCEPTION(operation_not_supported_exception, "OperationNotSupported");
+
+  //OCL
+  wrap::exception<isaac::driver::ocl::exception::base>("OclException", bp::no_init);
+#define BIND_OCL_EXCEPTION(CPPNAME, PYTHONNAME) \
+  wrap::exception<isaac::driver::ocl::exception::CPPNAME, bp::bases<isaac::driver::ocl::exception::base> >(PYTHONNAME)\
+  .def("__str__", &isaac::driver::ocl::exception::CPPNAME::what)
+  BIND_OCL_EXCEPTION(out_of_resources, "OclLaunchOutOfResources");
+  BIND_OCL_EXCEPTION(mem_object_allocation_failure, "MemObjectAllocationFailure");
+  BIND_OCL_EXCEPTION(out_of_host_memory, "OutOfHostMemory");
+  BIND_OCL_EXCEPTION(invalid_work_group_size, "InvalidWorkGroupSize");
+  BIND_OCL_EXCEPTION(invalid_value, "InvalidValue");
+
+  //CUDA
+  wrap::exception<isaac::driver::cuda::exception::base>("CudaException", bp::no_init);
+#define BIND_CUDA_EXCEPTION(CPPNAME, PYTHONNAME) \
+  wrap::exception<isaac::driver::cuda::exception::CPPNAME, bp::bases<isaac::driver::cuda::exception::base> >(PYTHONNAME)\
+  .def("__str__", &isaac::driver::cuda::exception::CPPNAME::what)
+  BIND_CUDA_EXCEPTION(launch_out_of_resources, "CudaLaunchOutOfResources");
 }
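For reference, each BIND_OCL_EXCEPTION line expands to the same Boost.Python wrapper the old code spelled out by hand; the only new ingredient is the bp::bases<> argument, which roots every OpenCL error under the common OclException. A sketch of one preprocessor expansion:

    // BIND_OCL_EXCEPTION(invalid_value, "InvalidValue"); expands to:
    wrap::exception<isaac::driver::ocl::exception::invalid_value,
                    bp::bases<isaac::driver::ocl::exception::base> >("InvalidValue")
        .def("__str__", &isaac::driver::ocl::exception::invalid_value::what);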

View File

@@ -3,6 +3,7 @@
 #include "common.hpp"
 #include "isaac/array.h"
 #include "isaac/wrap/clBLAS.h"
+#include "isaac/driver/common.h"

 namespace sc = isaac;
 typedef isaac::int_t int_t;
@@ -45,6 +46,7 @@ void test_element_wise_vector(T epsilon, simple_vector_base<T> & cx, simple_vect
     std::cout << std::endl;\
   }

+  if(queue.device().backend()==sc::driver::OPENCL){
 #define PREFIX "[C]"
   RUN_TEST_VECTOR_AXPY("AXPY", cz[i] = a*cx[i] + cz[i], BLAS<T>::F(clblasSaxpy, clblasDaxpy)(N, a, CHANDLE(x), x.start()[0], x.stride()[0],
                                                          CHANDLE(z), z.start()[0], z.stride()[0],
@@ -56,9 +58,9 @@ void test_element_wise_vector(T epsilon, simple_vector_base<T> & cx, simple_vect
   RUN_TEST_VECTOR_AXPY("SCAL", cz[i] = a*cz[i], BLAS<T>::F(clblasSscal, clblasDscal)(N, a, CHANDLE(z), z.start()[0], z.stride()[0],
                                                          1, &clqueue, 0, NULL, NULL));
 #undef PREFIX
+  }

 #define PREFIX "[C++]"
   RUN_TEST_VECTOR_AXPY("z = 0", cz[i] = 0, z = zeros(N, 1, dtype, context))
   RUN_TEST_VECTOR_AXPY("z = x", cz[i] = cx[i], z = x)

View File

@@ -52,7 +52,7 @@ void test_impl(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> cons
   else\
     std::cout << std::endl;

-  if(interf==clBLAS)
+  if(C.context().backend()==sc::driver::OPENCL && interf==clBLAS)
   {
     cl_command_queue clqueue = queue.handle().cl();

View File

@@ -24,20 +24,23 @@ def nrmse(y_ground, y):
 def train(X, Y, profiles):
     X = np.array(X)
     Y = np.array(Y)
-    M = X.shape[0]
+    #Remove unused profiles
+    unused = np.where(np.bincount(np.argmax(Y, 1))==0)[0]
+    profiles = [x for ix,x in enumerate(profiles) if ix not in unused]
+    Y = np.delete(Y, np.where(np.bincount(np.argmax(Y, 1))==0), axis=1)
+    #Shuffle
     p = np.random.permutation(X.shape[0])
+    M = X.shape[0]
     X = X[p,:]
     Y = Y[p,:]

     #Train the.profile
-    cut = int(1.00*M)
-    CV = .1
-    XTr, YTr = X[:,:], Y[:,:]
-    XCv, YCv = X[:max(1,CV*M),:], Y[:max(1,CV*M),:]
+    cut = int(.7*M)
+    XTr, YTr = X[:cut,:], Y[:cut,:]
+    XCv, YCv = X[cut:,:], Y[cut:,:]
     nrmses = {}
-    for N in range(1,min(M+1,20)):
+    for N in range(1,min(M+1,10)):
         for depth in range(1,min(M+1,20)):
             clf = RandomForestRegressor(N, max_depth=depth).fit(XTr, YTr)
             t = np.argmax(clf.predict(XCv), axis = 1)

View File

@@ -13,12 +13,12 @@ from numpy import cumsum
 import tools

-fetch_types = [sc.templates.fetching_policy_type.FETCH_FROM_GLOBAL_CONTIGUOUS,
-               sc.templates.fetching_policy_type.FETCH_FROM_GLOBAL_STRIDED,
+fetch_types = [sc.templates.fetching_policy_type.FETCH_FROM_LOCAL,
+               sc.templates.fetching_policy_type.FETCH_FROM_LOCAL,
                sc.templates.fetching_policy_type.FETCH_FROM_LOCAL,
                sc.templates.fetching_policy_type.FETCH_FROM_LOCAL]

-to_catch = (sc.OperationNotSupported, sc.LaunchOutOfResources, sc.MemObjectAllocationFailure, sc.InvalidWorkGroupSize, sc.OutOfHostMemory, sc.InvalidValue)
+to_catch = (sc.OperationNotSupported, sc.OclLaunchOutOfResources, sc.CudaLaunchOutOfResources, sc.MemObjectAllocationFailure, sc.InvalidWorkGroupSize, sc.OutOfHostMemory, sc.InvalidValue)

 def exhaustive(template, sizes, context):
     tree, _ = tools.tree_of(template, sizes, context)

View File

@@ -1,7 +1,7 @@
 import random, argparse, json, os
 from math import log, isinf
 from itertools import chain, product
-from numpy import argsort, argmax
+from numpy import argsort, argmax, where, delete, bincount
 from operator import mul
 import isaac as sc
 from isaac.external.sklearn.forest import RandomForestRegressor
@@ -9,6 +9,8 @@ import optimize, tools, model
 from json import encoder
 import json

+to_catch = (sc.OperationNotSupported, sc.OclLaunchOutOfResources, sc.CudaLaunchOutOfResources, sc.MemObjectAllocationFailure, sc.InvalidWorkGroupSize, sc.OutOfHostMemory, sc.InvalidValue)
+
 encoder.FLOAT_REPR = lambda o: format(o, '.2f')
 encoder.separators = (',',':')
@@ -30,14 +32,14 @@ def do_tuning(device, operation, json_path):
         sizes[sc.templates.axpy] = [(x,) for x in tools.expspace(1e3, 1e8, 4)]
         sizes[sc.templates.gemv_n] = product(pow2range(4,17), pow2range(4,17))
         sizes[sc.templates.gemv_t] = sizes[sc.templates.gemv_n]
-        sizes[sc.templates.gemm_nn] = product(pow2range(5, 12), pow2range(5, 12), pow2range(5, 15))
+        sizes[sc.templates.gemm_nn] = product(pow2range(5, 12), pow2range(5, 12), pow2range(5, 17))
         sizes[sc.templates.gemm_tn] = sizes[sc.templates.gemm_nn]
         sizes[sc.templates.gemm_nt] = sizes[sc.templates.gemm_nn]
         sizes[sc.templates.gemm_tt] = sizes[sc.templates.gemm_nn]

     #Quick tuning - AlexNet sizes + Intuition
-    quick_tuning = True
+    quick_tuning = False
     if quick_tuning:
         sizes[sc.templates.ger] = [(1536,1536)]
@@ -48,12 +50,14 @@ def do_tuning(device, operation, json_path):
                                         (729,256),
                                         (3025,96)]
-        sizes[sc.templates.gemm_nn] = [(576,20,25),(64,50,500),(3025,96,363),
+        sizes[sc.templates.gemm_nn] = [(3025,96,363),
                                        (729,128,1200),
                                        (169,384,2304),
                                        (169,192,1728),
                                        (169,128,1728)]
-        sizes[sc.templates.gemm_nt] = [(169,1728,128),
+        sizes[sc.templates.gemm_nt] = [(1536,1536,1536),
+                                       (169,1728,128),
                                        (169,1728,192),
                                        (169,2304,384),
                                        (729,1200,128)]
@@ -121,7 +125,7 @@ def do_tuning(device, operation, json_path):
             for b in best:
                 try:
                     perf += [performance(x, tools.benchmark(operation, profiles[b], tree))]
-                except (sc.OperationNotSupported, sc.LaunchOutOfResources, sc.MemObjectAllocationFailure):
+                except to_catch:
                     pass
             predicted = profiles[best[argmax(perf)]]
             tune = not optimize.is_local_optimum(predicted, operation, x, context)
@@ -138,7 +142,7 @@ def do_tuning(device, operation, json_path):
                 try:
                     time = tools.benchmark(operation, new, _tree)
                     perf = performance(xx, time)
-                except (sc.OperationNotSupported, sc.LaunchOutOfResources, sc.MemObjectAllocationFailure):
+                except to_catch:
                     perf = 0
                 yy.append(0 if isinf(perf) else perf)
             #Update dataset
@@ -147,7 +151,7 @@ def do_tuning(device, operation, json_path):
             for ip, p in enumerate(profiles):
                 try:
                     perf = 0 if fastest and ip < nparams and predperf[ip]/fastest < .1 else performance(x,tools.benchmark(operation, p, tree))
-                except (sc.OperationNotSupported, sc.LaunchOutOfResources, sc.MemObjectAllocationFailure):
+                except to_catch:
                     perf = 0
                 y.append(0 if isinf(perf) else perf)
                 X.append(x)
@@ -158,6 +162,10 @@ def do_tuning(device, operation, json_path):
             csv.writer(f).writerows(data)

+    unused = where(bincount(argmax(Y, 1))==0)[0]
+    profiles = [x for ix,x in enumerate(profiles) if ix not in unused]
+    Y = delete(Y, unused, axis=1)
+
     #Export to JSON
     json_path = tools.sanitize(device.name) + '.json' if not json_path else json_path
     if os.path.isfile(json_path):