Added a control flow API

This commit is contained in:
Philippe Tillet
2015-02-03 15:20:33 -05:00
parent 939ce15b45
commit 3a296ae3b7
9 changed files with 114 additions and 166 deletions

View File

@@ -32,7 +32,7 @@ void bench(ad::numeric_type dtype)
times.clear();\ times.clear();\
total_time = 0;\ total_time = 0;\
OP;\ OP;\
while(total_time < 5e-1){\ while(total_time < 1e-1){\
timer.start(); \ timer.start(); \
OP;\ OP;\
times.push_back(timer.get());\ times.push_back(timer.get());\

View File

@@ -5,38 +5,47 @@
namespace ad = atidlas; namespace ad = atidlas;
#ifdef BENCH_CUBLAS
__global__ void dummy(){}
#endif
int main() int main()
{ {
for(ad::cl_ext::queues_t::iterator it = ad::cl_ext::queues.begin() ; it != ad::cl_ext::queues.end() ; ++it) for(ad::cl_ext::queues_t::iterator it = ad::cl_ext::queues.begin() ; it != ad::cl_ext::queues.end() ; ++it)
{ {
ad::array x(10, ad::FLOAT_TYPE, it->first); cl::CommandQueue queue = it->second[0];
cl::Device device = it->second[0].getInfo<CL_QUEUE_DEVICE>(); cl::Device device = queue.getInfo<CL_QUEUE_DEVICE>();
ad::tools::timer t; cl::Program program("__kernel void dummy(){}");
program.build();
cl::Kernel kernel(program, "dummy");
cl::NDRange offset = cl::NullRange;
cl::NDRange global(1);
cl::NDRange local(1);
cl::Event event;
std::cout << "Device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl; std::cout << "Device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
std::cout << "-------------------------" << std::endl; std::cout << "-------------------------" << std::endl;
x = x + x;
ad::cl_ext::synchronize(x.context());
t.start();\
for(unsigned int i = 0 ; i < 100 ; ++i){
x = x + x;
ad::cl_ext::synchronize(x.context());
}
std::cout << "Kernel launch overhead: " << t.get()/100 << std::endl;
std::cout << "Expression tree creation:" << std::endl;
#define BENCH(CREATE, STR) \
{\
ad::array_expression tmp1(CREATE);\
t.start();\
for(unsigned int i = 0 ; i < 1000 ; ++i)\
ad::array_expression tmp2(CREATE);\
std::cout << STR << ": " << t.get()/1000 << std::endl;\
}
BENCH(x + x, "2 terms"); queue.enqueueNDRangeKernel(kernel, offset, global, local, NULL, &event);
BENCH(x + x + x, "3 terms"); queue.flush();
BENCH(x + x + x + x, "4 terms"); queue.finish();
BENCH(x + x + x + x + x, "5 terms");
#undef BENCH float time = event.getProfilingInfo<CL_PROFILING_COMMAND_START>() - event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
std::cout << "Kernel launch overhead: " << time << std::endl;
#ifdef BENCH_CUBLAS
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
dummy<<1, 1>>>();
cudaEventRecord(stop);
cudaEventSynchronize();
cudaEventElapsedTime(&time, start, stop);
std::cout << "CUDA Kernel launch overhead: " << time << std::endl;
#endif
std::cout << "-------------------------" << std::endl; std::cout << "-------------------------" << std::endl;
} }

View File

@@ -1,16 +1,8 @@
file(GLOB SYSTEM_STUDIO_ROOT /opt/intel/system_studio_*) file(GLOB SYSTEM_STUDIO_ROOT /opt/intel/system_studio_*)
find_path(MKL_INCLUDE_DIR mkl_blas.h find_path(MKL_INCLUDE_DIR mkl_blas.h HINTS ${SYSTEM_STUDIO_ROOT}/mkl/include/)
HINTS find_library(MKL_LIBRARIES NAMES mkl_core HINTS ${SYSTEM_STUDIO_ROOT}/mkl/lib/intel64/)
${SYSTEM_STUDIO_ROOT}/mkl/include/) find_library(ICC_LIBRARIES NAMES iomp5 HINTS ${SYSTEM_STUDIO_ROOT}/compiler/lib/intel64/)
find_library(MKL_LIBRARIES NAMES mkl_core
HINTS
${SYSTEM_STUDIO_ROOT}/mkl/lib/intel64/)
find_library(ICC_LIBRARIES NAMES iomp5
HINTS
${SYSTEM_STUDIO_ROOT}/compiler/lib/intel64/)
if(ICC_LIBRARIES) if(ICC_LIBRARIES)
set(OMP_LIBRARIES ${ICC_LIBRARIES}) set(OMP_LIBRARIES ${ICC_LIBRARIES})
@@ -18,7 +10,6 @@ else()
set(OMP_LIBRARIES gomp) set(OMP_LIBRARIES gomp)
endif() endif()
if(MKL_LIBRARIES AND OMP_LIBRARIES) if(MKL_LIBRARIES AND OMP_LIBRARIES)
set(MKL_LIBRARIES mkl_intel_lp64 mkl_avx mkl_intel_thread ${MKL_LIBRARIES} ${OMP_LIBRARIES} pthread) set(MKL_LIBRARIES mkl_intel_lp64 mkl_avx mkl_intel_thread ${MKL_LIBRARIES} ${OMP_LIBRARIES} pthread)
endif() endif()

View File

@@ -1,88 +1,9 @@
# - Find the OpenCL headers and library file(GLOB AMDAPPSDK_ROOT /opt/AMDAPPSDK*)
#
# Defines the following if found:
# OPENCL_FOUND : TRUE if found, FALSE otherwise
# OPENCL_INCLUDE_DIRS : Include directories for OpenCL
# OPENCL_LIBRARIES : The libraries to link against
#
# The user can set the OPENCLROOT environment variable to help finding OpenCL
# if it is installed in a non-standard place.
set(ENV_ATISTREAMSDKROOT $ENV{ATISTREAMSDKROOT}) find_package(CUDA QUIET)
if(ENV_ATISTREAMSDKROOT) find_path(OPENCL_INCLUDE_DIR CL/cl.hpp HINTS ${AMDAPPSDK_ROOT}/include/ ${CUDA_SDK_ROOT_DIR}/include)
set(ENV_OPENCLROOT $ENV{ATISTREAMSDKROOT}) find_library(OPENCL_LIBRARIES NAMES OpenCL HINTS ${AMDAPPSDK_ROOT}/lib/x86_64/ ${CUDA_SDK_ROOT_DIR}/lib64)
endif(ENV_ATISTREAMSDKROOT)
set(ENV_AMDAPPSDKROOT $ENV{AMDAPPSDKROOT})
if(ENV_AMDAPPSDKROOT)
set(ENV_OPENCLROOT $ENV{AMDAPPSDKROOT})
endif(ENV_AMDAPPSDKROOT)
set(ENV_INTELOCLSDKROOT $ENV{INTELOCLSDKROOT})
if(ENV_INTELOCLSDKROOT)
set(ENV_OPENCLROOT $ENV{INTELOCLSDKROOT})
endif(ENV_INTELOCLSDKROOT)
set(ENV_OPENCLROOT2 $ENV{OPENCLROOT})
if(ENV_OPENCLROOT2)
set(ENV_OPENCLROOT $ENV{OPENCLROOT})
endif(ENV_OPENCLROOT2)
if(ENV_OPENCLROOT)
find_path(
OPENCL_INCLUDE_DIR
NAMES CL/cl.h OpenCL/cl.h
PATHS ${ENV_OPENCLROOT}/include
#NO_DEFAULT_PATH #uncomment this is you wish to surpress the use of default paths for OpenCL
)
if (("${CMAKE_SYSTEM_NAME}" MATCHES "Linux") OR (${CMAKE_SYSTEM_NAME} MATCHES "Windows"))
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
set(OPENCL_LIB_SEARCH_PATH
${OPENCL_LIB_SEARCH_PATH}
${ENV_OPENCLROOT}/lib/x86)
else(CMAKE_SIZEOF_VOID_P EQUAL 4)
set(OPENCL_LIB_SEARCH_PATH
${OPENCL_LIB_SEARCH_PATH}
${ENV_OPENCLROOT}/lib/x86_64)
endif(CMAKE_SIZEOF_VOID_P EQUAL 4)
endif(("${CMAKE_SYSTEM_NAME}" MATCHES "Linux") OR (${CMAKE_SYSTEM_NAME} MATCHES "Windows"))
find_library(
OPENCL_LIBRARY
NAMES OpenCL
PATHS ${OPENCL_LIB_SEARCH_PATH}
#NO_DEFAULT_PATH #uncomment this is you wish to surpress the use of default paths for OpenCL
)
else(ENV_OPENCLROOT)
find_path(
OPENCL_INCLUDE_DIR
NAMES CL/cl.h OpenCL/cl.h
PATHS ${PROJECT_SOURCE_DIR} #use the CL/ include folder provided with ViennaCL
)
find_library(
OPENCL_LIBRARY
NAMES OpenCL
)
endif(ENV_OPENCLROOT)
include(FindPackageHandleStandardArgs) include(FindPackageHandleStandardArgs)
find_package_handle_standard_args( find_package_handle_standard_args(OpenCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIR)
OPENCL mark_as_advanced(OpenCL)
DEFAULT_MSG
OPENCL_LIBRARY OPENCL_INCLUDE_DIR
)
if(OPENCL_FOUND)
set(OPENCL_INCLUDE_DIRS ${OPENCL_INCLUDE_DIR})
set(OPENCL_LIBRARIES ${OPENCL_LIBRARY})
else(OPENCL_FOUND)
set(OPENCL_INCLUDE_DIRS)
set(OPENCL_LIBRARIES)
endif(OPENCL_FOUND)
mark_as_advanced(
OPENCL_INCLUDE_DIR
OPENCL_LIBRARY
)

View File

@@ -15,6 +15,7 @@ class scalar;
class array: public obj_base class array: public obj_base
{ {
friend array reshape(array const &, int_t, int_t); friend array reshape(array const &, int_t, int_t);
friend array reshape(array_expression const &, int_t, int_t);
static array_infos init_infos(numeric_type dtype, cl_mem data, int_t shape1, int_t shape2, int_t start1, int_t start2, int_t stride1, int_t stride2, int_t ld); static array_infos init_infos(numeric_type dtype, cl_mem data, int_t shape1, int_t shape2, int_t start1, int_t start2, int_t stride1, int_t stride2, int_t ld);
public: public:
//1D Constructors //1D Constructors
@@ -31,7 +32,7 @@ public:
//General constructor //General constructor
array(numeric_type dtype, cl::Buffer data, slice const & s1, slice const & s2, int_t ld, cl::Context context = cl_ext::default_context()); array(numeric_type dtype, cl::Buffer data, slice const & s1, slice const & s2, int_t ld, cl::Context context = cl_ext::default_context());
array(array_expression const & proxy); array(control const & proxy);
array(array const &); array(array const &);
//Getters //Getters
@@ -50,7 +51,7 @@ public:
//Numeric operators //Numeric operators
array& operator=(array const &); array& operator=(array const &);
array& operator=(array_expression const &); array& operator=(control const &);
template<class T> array & operator=(std::vector<T> const & rhs); template<class T> array & operator=(std::vector<T> const & rhs);
array_expression operator-(); array_expression operator-();
@@ -90,7 +91,7 @@ public:
explicit scalar(numeric_type dtype, cl::Buffer const & data, int_t offset, cl::Context context = cl_ext::default_context()); explicit scalar(numeric_type dtype, cl::Buffer const & data, int_t offset, cl::Context context = cl_ext::default_context());
explicit scalar(value_scalar value, cl::Context context = cl_ext::default_context()); explicit scalar(value_scalar value, cl::Context context = cl_ext::default_context());
explicit scalar(numeric_type dtype, cl::Context context = cl_ext::default_context()); explicit scalar(numeric_type dtype, cl::Context context = cl_ext::default_context());
scalar(array_expression const & proxy); scalar(control const & proxy);
scalar& operator=(value_scalar const &); scalar& operator=(value_scalar const &);
// scalar& operator=(scalar const & s); // scalar& operator=(scalar const & s);
using array::operator =; using array::operator =;

View File

@@ -7,25 +7,25 @@
namespace atidlas namespace atidlas
{ {
namespace cl_ext namespace cl_ext
{ {
typedef std::map<std::pair<cl_program, unsigned int>, cl::Kernel> kernels_t; typedef std::map<std::pair<cl_program, unsigned int>, cl::Kernel> kernels_t;
typedef std::vector<std::pair<cl::Context, std::vector<cl::CommandQueue> > > queues_t; typedef std::vector<std::pair<cl::Context, std::vector<cl::CommandQueue> > > queues_t;
queues_t init_queues(); extern kernels_t kernels;
extern queues_t queues;
extern unsigned int default_context_idx;
extern cl_command_queue_properties queue_properties;
void synchronize(cl::Context const & context); void synchronize(cl::Context const & context);
cl::Context default_context(); cl::Context default_context();
cl::CommandQueue & get_queue(cl::Context const &, std::size_t); cl::CommandQueue & get_queue(cl::Context const &, std::size_t);
cl::Device get_device(cl::CommandQueue &); cl::Device get_device(cl::CommandQueue &);
std::vector<cl::CommandQueue> & get_queues(cl::Context const & ctx); std::vector<cl::CommandQueue> & get_queues(cl::Context const & ctx);
extern unsigned int default_context_idx;
extern kernels_t kernels;
extern queues_t queues;
} }
} }
#endif #endif

View File

@@ -208,6 +208,27 @@ private:
size4 shape_; size4 shape_;
}; };
class control
{
public:
control(array_expression const & x, cl::Event* event = NULL, std::vector<cl::Event>* dependencies = NULL,
cl::CommandQueue* queue = NULL, operation_cache* cache = NULL) : x_(x), event_(event), dependencies_(dependencies), queue_(queue), cache_(cache){}
array_expression const & expression() const { return x_; }
cl::Event* event() const { return event_; }
std::vector<cl::Event>* dependencies() const { return dependencies_; }
cl::CommandQueue* queue() const { return queue_; }
operation_cache* cache() const { return cache_; }
private:
array_expression const & x_;
cl::Event* event_;
std::vector<cl::Event>* dependencies_;
cl::CommandQueue* queue_;
operation_cache* cache_;
};
class expressions_tuple class expressions_tuple
{ {
private: private:

View File

@@ -96,9 +96,9 @@ context_(context), data_(data),
infos_(init_infos(dtype, data_(), s1.size, s2.size, s1.start, s2.start, s1.stride, s2.stride, ld)) infos_(init_infos(dtype, data_(), s1.size, s2.size, s1.start, s2.start, s1.stride, s2.stride, ld))
{ } { }
array::array(array_expression const & x): array::array(control const & x):
context_(x.context()), data_(context_, CL_MEM_READ_WRITE, size_of(x.dtype())*prod(x.shape())), context_(x.expression().context()), data_(context_, CL_MEM_READ_WRITE, size_of(x.expression().dtype())*prod(x.expression().shape())),
infos_(init_infos(x.dtype(), data_(), x.shape()._1, x.shape()._2, 0, 0, 1, 1, x.shape()._1)) infos_(init_infos(x.expression().dtype(), data_(), x.expression().shape()._1, x.expression().shape()._2, 0, 0, 1, 1, x.expression().shape()._1))
{ {
*this = x; *this = x;
} }
@@ -151,8 +151,10 @@ array & array::operator=(array const & rhs)
return *this; return *this;
} }
array & array::operator=(array_expression const & rhs) array & array::operator=(control const & x)
{ {
array_expression const & rhs = x.expression();
assert(dtype() == rhs.dtype()); assert(dtype() == rhs.dtype());
array_expression expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ASSIGN_TYPE), dtype(), shape()); array_expression expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ASSIGN_TYPE), dtype(), shape());
cl::CommandQueue & queue = cl_ext::get_queue(context_, 0); cl::CommandQueue & queue = cl_ext::get_queue(context_, 0);
@@ -293,7 +295,7 @@ scalar::scalar(value_scalar value, cl::Context context) : array(1, value.dtype()
scalar::scalar(numeric_type dtype, cl::Context context) : array(1, dtype, context) scalar::scalar(numeric_type dtype, cl::Context context) : array(1, dtype, context)
{ } { }
scalar::scalar(array_expression const & proxy) : array(proxy){ } scalar::scalar(control const &proxy) : array(proxy){ }
template<class T> template<class T>
T scalar::cast() const T scalar::cast() const
@@ -710,6 +712,13 @@ array reshape(array const & a, int_t size1, int_t size2)
return tmp; return tmp;
} }
array reshape(array_expression const & a, int_t size1, int_t size2)
{
array tmp(a);
tmp.infos_.shape1 = size1;
tmp.infos_.shape2 = size2;
return tmp;
}
#define DEFINE_DOT(LTYPE, RTYPE) \ #define DEFINE_DOT(LTYPE, RTYPE) \
array_expression dot(LTYPE const & x, RTYPE const & y)\ array_expression dot(LTYPE const & x, RTYPE const & y)\

View File

@@ -8,6 +8,11 @@ namespace atidlas
namespace cl_ext namespace cl_ext
{ {
cl_command_queue_properties queue_properties = 0;
unsigned int default_context_idx = 0;
queues_t queues;
kernels_t kernels;
void synchronize(cl::Context const & context) void synchronize(cl::Context const & context)
{ {
std::vector<cl::CommandQueue> & q = get_queues(context); std::vector<cl::CommandQueue> & q = get_queues(context);
@@ -15,52 +20,43 @@ void synchronize(cl::Context const & context)
it->finish(); it->finish();
} }
queues_t init_queues() void init_queues()
{ {
queues_t result; std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
std::vector<cl::Platform> platforms; for(std::vector<cl::Platform>::iterator it = platforms.begin() ; it != platforms.end() ; ++it)
cl::Platform::get(&platforms);
for(std::vector<cl::Platform>::iterator it = platforms.begin() ; it != platforms.end() ; ++it)
{
std::vector<cl::Device> devices;
it->getDevices(CL_DEVICE_TYPE_ALL, &devices);
for(std::vector<cl::Device>::iterator itt = devices.begin() ; itt != devices.end() ; ++itt)
{ {
std::vector<cl::Device> current(1, *itt); std::vector<cl::Device> devices;
cl::Context context(current); it->getDevices(CL_DEVICE_TYPE_ALL, &devices);
cl::CommandQueue queue(context, *itt); for(std::vector<cl::Device>::iterator itt = devices.begin() ; itt != devices.end() ; ++itt)
result.push_back(std::make_pair(context, std::vector<cl::CommandQueue>(1, queue))); queues.push_back(std::make_pair(cl::Context(std::vector<cl::Device>(1, *itt)), std::vector<cl::CommandQueue>()));
} }
} for(queues_t::iterator it = queues.begin() ; it != queues.end() ; ++it)
it->second.push_back(cl::CommandQueue(it->first, it->first.getInfo<CL_CONTEXT_DEVICES>()[0], queue_properties));
return result;
} }
cl::Context default_context() cl::Context default_context()
{ {
return queues[default_context_idx].second.front().getInfo<CL_QUEUE_CONTEXT>(); if(queues.empty())
init_queues();
return queues.begin()->first;
} }
std::vector<cl::CommandQueue> & get_queues(cl::Context const & ctx) std::vector<cl::CommandQueue> & get_queues(cl::Context const & ctx)
{ {
if(queues.empty())
init_queues();
for(queues_t::iterator it = queues.begin() ; it != queues.end() ; ++it) for(queues_t::iterator it = queues.begin() ; it != queues.end() ; ++it)
if(it->first()==ctx()) if(it->first()==ctx()) return it->second;
return it->second; throw std::out_of_range("No such context registered in the backend. Please run atidlas::cl_ext:;register(context, queues)");
queues.push_back(std::make_pair(ctx, std::vector<cl::CommandQueue>(1, cl::CommandQueue(ctx, ctx.getInfo<CL_CONTEXT_DEVICES>()[0]))));
return queues.back().second;
} }
cl::CommandQueue & get_queue(cl::Context const & ctx, std::size_t idx) cl::CommandQueue & get_queue(cl::Context const & ctx, std::size_t idx)
{ return get_queues(ctx)[idx]; } {
return get_queues(ctx)[idx];
}
unsigned int default_context_idx = 0;
queues_t queues = init_queues();
kernels_t kernels = kernels_t();
} }