Added a control flow API
This commit is contained in:
@@ -32,7 +32,7 @@ void bench(ad::numeric_type dtype)
|
||||
times.clear();\
|
||||
total_time = 0;\
|
||||
OP;\
|
||||
while(total_time < 5e-1){\
|
||||
while(total_time < 1e-1){\
|
||||
timer.start(); \
|
||||
OP;\
|
||||
times.push_back(timer.get());\
|
||||
|
@@ -5,38 +5,47 @@
|
||||
|
||||
namespace ad = atidlas;
|
||||
|
||||
#ifdef BENCH_CUBLAS
|
||||
__global__ void dummy(){}
|
||||
#endif
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
for(ad::cl_ext::queues_t::iterator it = ad::cl_ext::queues.begin() ; it != ad::cl_ext::queues.end() ; ++it)
|
||||
{
|
||||
ad::array x(10, ad::FLOAT_TYPE, it->first);
|
||||
cl::Device device = it->second[0].getInfo<CL_QUEUE_DEVICE>();
|
||||
ad::tools::timer t;
|
||||
cl::CommandQueue queue = it->second[0];
|
||||
cl::Device device = queue.getInfo<CL_QUEUE_DEVICE>();
|
||||
cl::Program program("__kernel void dummy(){}");
|
||||
program.build();
|
||||
cl::Kernel kernel(program, "dummy");
|
||||
|
||||
cl::NDRange offset = cl::NullRange;
|
||||
cl::NDRange global(1);
|
||||
cl::NDRange local(1);
|
||||
|
||||
cl::Event event;
|
||||
std::cout << "Device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
|
||||
std::cout << "-------------------------" << std::endl;
|
||||
x = x + x;
|
||||
ad::cl_ext::synchronize(x.context());
|
||||
t.start();\
|
||||
for(unsigned int i = 0 ; i < 100 ; ++i){
|
||||
x = x + x;
|
||||
ad::cl_ext::synchronize(x.context());
|
||||
}
|
||||
std::cout << "Kernel launch overhead: " << t.get()/100 << std::endl;
|
||||
std::cout << "Expression tree creation:" << std::endl;
|
||||
#define BENCH(CREATE, STR) \
|
||||
{\
|
||||
ad::array_expression tmp1(CREATE);\
|
||||
t.start();\
|
||||
for(unsigned int i = 0 ; i < 1000 ; ++i)\
|
||||
ad::array_expression tmp2(CREATE);\
|
||||
std::cout << STR << ": " << t.get()/1000 << std::endl;\
|
||||
}
|
||||
|
||||
BENCH(x + x, "2 terms");
|
||||
BENCH(x + x + x, "3 terms");
|
||||
BENCH(x + x + x + x, "4 terms");
|
||||
BENCH(x + x + x + x + x, "5 terms");
|
||||
#undef BENCH
|
||||
queue.enqueueNDRangeKernel(kernel, offset, global, local, NULL, &event);
|
||||
queue.flush();
|
||||
queue.finish();
|
||||
|
||||
float time = event.getProfilingInfo<CL_PROFILING_COMMAND_START>() - event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
|
||||
std::cout << "Kernel launch overhead: " << time << std::endl;
|
||||
|
||||
#ifdef BENCH_CUBLAS
// Time the launch of the empty CUDA kernel `dummy` with a pair of CUDA events.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
dummy<<<1, 1>>>();
cudaEventRecord(stop);
// Block until `stop` has actually been reached; the elapsed time is undefined
// before that point.
cudaEventSynchronize(stop);
// cudaEventElapsedTime reports milliseconds (the OpenCL figure printed earlier
// in this loop is in nanoseconds).
cudaEventElapsedTime(&time, start, stop);
std::cout << "CUDA Kernel launch overhead: " << time << std::endl;
// This section runs once per iteration of the enclosing loop over queues, so
// release the events instead of leaking a pair each time.
cudaEventDestroy(start);
cudaEventDestroy(stop);
#endif
|
||||
std::cout << "-------------------------" << std::endl;
|
||||
}
|
||||
|
||||
|
@@ -1,16 +1,8 @@
|
||||
file(GLOB SYSTEM_STUDIO_ROOT /opt/intel/system_studio_*)
|
||||
|
||||
find_path(MKL_INCLUDE_DIR mkl_blas.h
|
||||
HINTS
|
||||
${SYSTEM_STUDIO_ROOT}/mkl/include/)
|
||||
|
||||
find_library(MKL_LIBRARIES NAMES mkl_core
|
||||
HINTS
|
||||
${SYSTEM_STUDIO_ROOT}/mkl/lib/intel64/)
|
||||
|
||||
find_library(ICC_LIBRARIES NAMES iomp5
|
||||
HINTS
|
||||
${SYSTEM_STUDIO_ROOT}/compiler/lib/intel64/)
|
||||
find_path(MKL_INCLUDE_DIR mkl_blas.h HINTS ${SYSTEM_STUDIO_ROOT}/mkl/include/)
|
||||
find_library(MKL_LIBRARIES NAMES mkl_core HINTS ${SYSTEM_STUDIO_ROOT}/mkl/lib/intel64/)
|
||||
find_library(ICC_LIBRARIES NAMES iomp5 HINTS ${SYSTEM_STUDIO_ROOT}/compiler/lib/intel64/)
|
||||
|
||||
if(ICC_LIBRARIES)
|
||||
set(OMP_LIBRARIES ${ICC_LIBRARIES})
|
||||
@@ -18,7 +10,6 @@ else()
|
||||
set(OMP_LIBRARIES gomp)
|
||||
endif()
|
||||
|
||||
|
||||
if(MKL_LIBRARIES AND OMP_LIBRARIES)
|
||||
set(MKL_LIBRARIES mkl_intel_lp64 mkl_avx mkl_intel_thread ${MKL_LIBRARIES} ${OMP_LIBRARIES} pthread)
|
||||
endif()
|
||||
|
@@ -1,88 +1,9 @@
|
||||
# - Find the OpenCL headers and library
|
||||
#
|
||||
# Defines the following if found:
|
||||
# OPENCL_FOUND : TRUE if found, FALSE otherwise
|
||||
# OPENCL_INCLUDE_DIRS : Include directories for OpenCL
|
||||
# OPENCL_LIBRARIES : The libraries to link against
|
||||
#
|
||||
# The user can set the OPENCLROOT environment variable to help finding OpenCL
|
||||
# if it is installed in a non-standard place.
|
||||
file(GLOB AMDAPPSDK_ROOT /opt/AMDAPPSDK*)
|
||||
|
||||
set(ENV_ATISTREAMSDKROOT $ENV{ATISTREAMSDKROOT})
|
||||
if(ENV_ATISTREAMSDKROOT)
|
||||
set(ENV_OPENCLROOT $ENV{ATISTREAMSDKROOT})
|
||||
endif(ENV_ATISTREAMSDKROOT)
|
||||
|
||||
set(ENV_AMDAPPSDKROOT $ENV{AMDAPPSDKROOT})
|
||||
if(ENV_AMDAPPSDKROOT)
|
||||
set(ENV_OPENCLROOT $ENV{AMDAPPSDKROOT})
|
||||
endif(ENV_AMDAPPSDKROOT)
|
||||
|
||||
set(ENV_INTELOCLSDKROOT $ENV{INTELOCLSDKROOT})
|
||||
if(ENV_INTELOCLSDKROOT)
|
||||
set(ENV_OPENCLROOT $ENV{INTELOCLSDKROOT})
|
||||
endif(ENV_INTELOCLSDKROOT)
|
||||
|
||||
set(ENV_OPENCLROOT2 $ENV{OPENCLROOT})
|
||||
if(ENV_OPENCLROOT2)
|
||||
set(ENV_OPENCLROOT $ENV{OPENCLROOT})
|
||||
endif(ENV_OPENCLROOT2)
|
||||
|
||||
if(ENV_OPENCLROOT)
|
||||
find_path(
|
||||
OPENCL_INCLUDE_DIR
|
||||
NAMES CL/cl.h OpenCL/cl.h
|
||||
PATHS ${ENV_OPENCLROOT}/include
|
||||
#NO_DEFAULT_PATH #uncomment this if you wish to suppress the use of default paths for OpenCL
|
||||
)
|
||||
|
||||
if (("${CMAKE_SYSTEM_NAME}" MATCHES "Linux") OR (${CMAKE_SYSTEM_NAME} MATCHES "Windows"))
|
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
|
||||
set(OPENCL_LIB_SEARCH_PATH
|
||||
${OPENCL_LIB_SEARCH_PATH}
|
||||
${ENV_OPENCLROOT}/lib/x86)
|
||||
else(CMAKE_SIZEOF_VOID_P EQUAL 4)
|
||||
set(OPENCL_LIB_SEARCH_PATH
|
||||
${OPENCL_LIB_SEARCH_PATH}
|
||||
${ENV_OPENCLROOT}/lib/x86_64)
|
||||
endif(CMAKE_SIZEOF_VOID_P EQUAL 4)
|
||||
endif(("${CMAKE_SYSTEM_NAME}" MATCHES "Linux") OR (${CMAKE_SYSTEM_NAME} MATCHES "Windows"))
|
||||
find_library(
|
||||
OPENCL_LIBRARY
|
||||
NAMES OpenCL
|
||||
PATHS ${OPENCL_LIB_SEARCH_PATH}
|
||||
#NO_DEFAULT_PATH #uncomment this if you wish to suppress the use of default paths for OpenCL
|
||||
)
|
||||
else(ENV_OPENCLROOT)
|
||||
find_path(
|
||||
OPENCL_INCLUDE_DIR
|
||||
NAMES CL/cl.h OpenCL/cl.h
|
||||
PATHS ${PROJECT_SOURCE_DIR} #use the CL/ include folder provided with ViennaCL
|
||||
)
|
||||
|
||||
find_library(
|
||||
OPENCL_LIBRARY
|
||||
NAMES OpenCL
|
||||
)
|
||||
endif(ENV_OPENCLROOT)
|
||||
find_package(CUDA QUIET)
|
||||
find_path(OPENCL_INCLUDE_DIR CL/cl.hpp HINTS ${AMDAPPSDK_ROOT}/include/ ${CUDA_SDK_ROOT_DIR}/include)
|
||||
find_library(OPENCL_LIBRARIES NAMES OpenCL HINTS ${AMDAPPSDK_ROOT}/lib/x86_64/ ${CUDA_SDK_ROOT_DIR}/lib64)
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(
|
||||
OPENCL
|
||||
DEFAULT_MSG
|
||||
OPENCL_LIBRARY OPENCL_INCLUDE_DIR
|
||||
)
|
||||
|
||||
if(OPENCL_FOUND)
|
||||
set(OPENCL_INCLUDE_DIRS ${OPENCL_INCLUDE_DIR})
|
||||
set(OPENCL_LIBRARIES ${OPENCL_LIBRARY})
|
||||
else(OPENCL_FOUND)
|
||||
set(OPENCL_INCLUDE_DIRS)
|
||||
set(OPENCL_LIBRARIES)
|
||||
endif(OPENCL_FOUND)
|
||||
|
||||
mark_as_advanced(
|
||||
OPENCL_INCLUDE_DIR
|
||||
OPENCL_LIBRARY
|
||||
)
|
||||
|
||||
find_package_handle_standard_args(OpenCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIR)
|
||||
mark_as_advanced(OpenCL)
|
||||
|
@@ -15,6 +15,7 @@ class scalar;
|
||||
class array: public obj_base
|
||||
{
|
||||
friend array reshape(array const &, int_t, int_t);
|
||||
friend array reshape(array_expression const &, int_t, int_t);
|
||||
static array_infos init_infos(numeric_type dtype, cl_mem data, int_t shape1, int_t shape2, int_t start1, int_t start2, int_t stride1, int_t stride2, int_t ld);
|
||||
public:
|
||||
//1D Constructors
|
||||
@@ -31,7 +32,7 @@ public:
|
||||
|
||||
//General constructor
|
||||
array(numeric_type dtype, cl::Buffer data, slice const & s1, slice const & s2, int_t ld, cl::Context context = cl_ext::default_context());
|
||||
array(array_expression const & proxy);
|
||||
array(control const & proxy);
|
||||
array(array const &);
|
||||
|
||||
//Getters
|
||||
@@ -50,7 +51,7 @@ public:
|
||||
|
||||
//Numeric operators
|
||||
array& operator=(array const &);
|
||||
array& operator=(array_expression const &);
|
||||
array& operator=(control const &);
|
||||
template<class T> array & operator=(std::vector<T> const & rhs);
|
||||
|
||||
array_expression operator-();
|
||||
@@ -90,7 +91,7 @@ public:
|
||||
explicit scalar(numeric_type dtype, cl::Buffer const & data, int_t offset, cl::Context context = cl_ext::default_context());
|
||||
explicit scalar(value_scalar value, cl::Context context = cl_ext::default_context());
|
||||
explicit scalar(numeric_type dtype, cl::Context context = cl_ext::default_context());
|
||||
scalar(array_expression const & proxy);
|
||||
scalar(control const & proxy);
|
||||
scalar& operator=(value_scalar const &);
|
||||
// scalar& operator=(scalar const & s);
|
||||
using array::operator =;
|
||||
|
@@ -7,25 +7,25 @@
|
||||
|
||||
namespace atidlas
|
||||
{
|
||||
|
||||
namespace cl_ext
|
||||
{
|
||||
|
||||
typedef std::map<std::pair<cl_program, unsigned int>, cl::Kernel> kernels_t;
|
||||
typedef std::vector<std::pair<cl::Context, std::vector<cl::CommandQueue> > > queues_t;
|
||||
|
||||
queues_t init_queues();
|
||||
extern kernels_t kernels;
|
||||
extern queues_t queues;
|
||||
extern unsigned int default_context_idx;
|
||||
extern cl_command_queue_properties queue_properties;
|
||||
|
||||
|
||||
void synchronize(cl::Context const & context);
|
||||
cl::Context default_context();
|
||||
cl::CommandQueue & get_queue(cl::Context const &, std::size_t);
|
||||
cl::Device get_device(cl::CommandQueue &);
|
||||
std::vector<cl::CommandQueue> & get_queues(cl::Context const & ctx);
|
||||
extern unsigned int default_context_idx;
|
||||
extern kernels_t kernels;
|
||||
extern queues_t queues;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -208,6 +208,27 @@ private:
|
||||
size4 shape_;
|
||||
};
|
||||
|
||||
class control
|
||||
{
|
||||
public:
|
||||
control(array_expression const & x, cl::Event* event = NULL, std::vector<cl::Event>* dependencies = NULL,
|
||||
cl::CommandQueue* queue = NULL, operation_cache* cache = NULL) : x_(x), event_(event), dependencies_(dependencies), queue_(queue), cache_(cache){}
|
||||
|
||||
array_expression const & expression() const { return x_; }
|
||||
cl::Event* event() const { return event_; }
|
||||
std::vector<cl::Event>* dependencies() const { return dependencies_; }
|
||||
cl::CommandQueue* queue() const { return queue_; }
|
||||
operation_cache* cache() const { return cache_; }
|
||||
|
||||
private:
|
||||
array_expression const & x_;
|
||||
|
||||
cl::Event* event_;
|
||||
std::vector<cl::Event>* dependencies_;
|
||||
cl::CommandQueue* queue_;
|
||||
operation_cache* cache_;
|
||||
};
|
||||
|
||||
class expressions_tuple
|
||||
{
|
||||
private:
|
||||
|
@@ -96,9 +96,9 @@ context_(context), data_(data),
|
||||
infos_(init_infos(dtype, data_(), s1.size, s2.size, s1.start, s2.start, s1.stride, s2.stride, ld))
|
||||
{ }
|
||||
|
||||
array::array(array_expression const & x):
|
||||
context_(x.context()), data_(context_, CL_MEM_READ_WRITE, size_of(x.dtype())*prod(x.shape())),
|
||||
infos_(init_infos(x.dtype(), data_(), x.shape()._1, x.shape()._2, 0, 0, 1, 1, x.shape()._1))
|
||||
array::array(control const & x):
|
||||
context_(x.expression().context()), data_(context_, CL_MEM_READ_WRITE, size_of(x.expression().dtype())*prod(x.expression().shape())),
|
||||
infos_(init_infos(x.expression().dtype(), data_(), x.expression().shape()._1, x.expression().shape()._2, 0, 0, 1, 1, x.expression().shape()._1))
|
||||
{
|
||||
*this = x;
|
||||
}
|
||||
@@ -151,8 +151,10 @@ array & array::operator=(array const & rhs)
|
||||
return *this;
|
||||
}
|
||||
|
||||
array & array::operator=(array_expression const & rhs)
|
||||
array & array::operator=(control const & x)
|
||||
{
|
||||
array_expression const & rhs = x.expression();
|
||||
|
||||
assert(dtype() == rhs.dtype());
|
||||
array_expression expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ASSIGN_TYPE), dtype(), shape());
|
||||
cl::CommandQueue & queue = cl_ext::get_queue(context_, 0);
|
||||
@@ -293,7 +295,7 @@ scalar::scalar(value_scalar value, cl::Context context) : array(1, value.dtype()
|
||||
scalar::scalar(numeric_type dtype, cl::Context context) : array(1, dtype, context)
|
||||
{ }
|
||||
|
||||
scalar::scalar(array_expression const & proxy) : array(proxy){ }
|
||||
scalar::scalar(control const &proxy) : array(proxy){ }
|
||||
|
||||
template<class T>
|
||||
T scalar::cast() const
|
||||
@@ -710,6 +712,13 @@ array reshape(array const & a, int_t size1, int_t size2)
|
||||
return tmp;
|
||||
}
|
||||
|
||||
/**
 * Builds an array from the expression `a` and relabels its shape as
 * size1 x size2.
 *
 * Only the two shape fields of the new array's infos_ are overwritten.
 * NOTE(review): nothing here checks that size1 * size2 matches the number of
 * elements produced by `a` -- confirm callers guarantee it.
 */
array reshape(array_expression const & a, int_t size1, int_t size2)
{
  array reshaped(a);
  reshaped.infos_.shape2 = size2;
  reshaped.infos_.shape1 = size1;
  return reshaped;
}
|
||||
|
||||
#define DEFINE_DOT(LTYPE, RTYPE) \
|
||||
array_expression dot(LTYPE const & x, RTYPE const & y)\
|
||||
|
@@ -8,6 +8,11 @@ namespace atidlas
|
||||
namespace cl_ext
|
||||
{
|
||||
|
||||
// Properties passed to each cl::CommandQueue that init_queues() creates;
// 0 means no optional queue properties are requested.
cl_command_queue_properties queue_properties = 0;
|
||||
// Index intended to select the default context within `queues`.
// NOTE(review): the lazily-initialising default_context() returns the first
// registered context and does not appear to read this index -- confirm.
unsigned int default_context_idx = 0;
|
||||
// Registered (context, command queues) pairs. Starts empty; default_context()
// and get_queues() call init_queues() when they find it empty.
queues_t queues;
|
||||
// Map from (cl_program, unsigned int) to cl::Kernel, per the kernels_t typedef.
// TODO confirm what the unsigned int component identifies.
kernels_t kernels;
|
||||
|
||||
void synchronize(cl::Context const & context)
|
||||
{
|
||||
std::vector<cl::CommandQueue> & q = get_queues(context);
|
||||
@@ -15,52 +20,43 @@ void synchronize(cl::Context const & context)
|
||||
it->finish();
|
||||
}
|
||||
|
||||
queues_t init_queues()
|
||||
void init_queues()
|
||||
{
|
||||
queues_t result;
|
||||
|
||||
std::vector<cl::Platform> platforms;
|
||||
cl::Platform::get(&platforms);
|
||||
|
||||
for(std::vector<cl::Platform>::iterator it = platforms.begin() ; it != platforms.end() ; ++it)
|
||||
{
|
||||
std::vector<cl::Device> devices;
|
||||
it->getDevices(CL_DEVICE_TYPE_ALL, &devices);
|
||||
for(std::vector<cl::Device>::iterator itt = devices.begin() ; itt != devices.end() ; ++itt)
|
||||
{
|
||||
std::vector<cl::Device> current(1, *itt);
|
||||
cl::Context context(current);
|
||||
cl::CommandQueue queue(context, *itt);
|
||||
result.push_back(std::make_pair(context, std::vector<cl::CommandQueue>(1, queue)));
|
||||
queues.push_back(std::make_pair(cl::Context(std::vector<cl::Device>(1, *itt)), std::vector<cl::CommandQueue>()));
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
for(queues_t::iterator it = queues.begin() ; it != queues.end() ; ++it)
|
||||
it->second.push_back(cl::CommandQueue(it->first, it->first.getInfo<CL_CONTEXT_DEVICES>()[0], queue_properties));
|
||||
}
|
||||
|
||||
|
||||
cl::Context default_context()
|
||||
{
|
||||
return queues[default_context_idx].second.front().getInfo<CL_QUEUE_CONTEXT>();
|
||||
if(queues.empty())
|
||||
init_queues();
|
||||
return queues.begin()->first;
|
||||
}
|
||||
|
||||
std::vector<cl::CommandQueue> & get_queues(cl::Context const & ctx)
|
||||
{
|
||||
if(queues.empty())
|
||||
init_queues();
|
||||
for(queues_t::iterator it = queues.begin() ; it != queues.end() ; ++it)
|
||||
if(it->first()==ctx())
|
||||
return it->second;
|
||||
queues.push_back(std::make_pair(ctx, std::vector<cl::CommandQueue>(1, cl::CommandQueue(ctx, ctx.getInfo<CL_CONTEXT_DEVICES>()[0]))));
|
||||
return queues.back().second;
|
||||
if(it->first()==ctx()) return it->second;
|
||||
// No registered context matched `ctx`: report it instead of registering one implicitly.
throw std::out_of_range("No such context registered in the backend. Please run atidlas::cl_ext::register(context, queues)");
|
||||
}
|
||||
|
||||
cl::CommandQueue & get_queue(cl::Context const & ctx, std::size_t idx)
|
||||
{ return get_queues(ctx)[idx]; }
|
||||
{
|
||||
return get_queues(ctx)[idx];
|
||||
}
|
||||
|
||||
|
||||
unsigned int default_context_idx = 0;
|
||||
|
||||
queues_t queues = init_queues();
|
||||
kernels_t kernels = kernels_t();
|
||||
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user