From e214927b1677c679edf52d8e9677c20b43a1e947 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 5 Feb 2015 04:42:57 -0500 Subject: [PATCH] Better control flow through options --- bench/blas.cpp | 39 +++++----- include/atidlas/array.h | 9 ++- include/atidlas/backend/templates/base.h | 2 +- include/atidlas/backend/templates/maxpy.h | 2 +- include/atidlas/backend/templates/mproduct.h | 4 +- .../atidlas/backend/templates/mreduction.h | 2 +- include/atidlas/backend/templates/reduction.h | 2 +- include/atidlas/backend/templates/vaxpy.h | 2 +- include/atidlas/model/model.h | 16 +--- include/atidlas/symbolic/execute.h | 2 +- include/atidlas/symbolic/expression.h | 78 ++++++++++++++++--- include/atidlas/types.h | 30 ------- lib/array.cpp | 21 ++--- lib/backend/templates/maxpy.cpp | 11 ++- lib/backend/templates/mproduct.cpp | 35 +++++---- lib/backend/templates/mreduction.cpp | 14 ++-- lib/backend/templates/reduction.cpp | 15 ++-- lib/backend/templates/vaxpy.cpp | 12 ++- lib/model/model.cpp | 27 +++---- lib/symbolic/execute.cpp | 13 ++-- python/pyatidlas/src/_atidlas.cpp | 6 +- 21 files changed, 176 insertions(+), 166 deletions(-) diff --git a/bench/blas.cpp b/bench/blas.cpp index b9afc20bb..1b3cace01 100644 --- a/bench/blas.cpp +++ b/bench/blas.cpp @@ -27,44 +27,39 @@ void bench(ad::numeric_type dtype) ad::tools::timer timer; unsigned int dtsize = ad::size_of(dtype); -#define BENCHMARK(OP, PERF) \ +#define BENCHMARK(OP, TIME, PERF) \ {\ - times.clear();\ total_time = 0;\ - OP;\ - while(total_time < 1e-1){\ - timer.start(); \ + while(total_time < 1e-3){\ + cl::Event event;\ OP;\ - times.push_back(timer.get());\ + times.push_back(TIME);\ total_time += times.back();\ }\ float tres = median(times);\ - std::cout << " " << PERF << std::flush;\ + std::cout << " " << tres << std::flush;\ } +#define CL_TIME +#define CL_SYNC queue.flush(); queue.finish() /*---------*/ /*--BLAS1--*/ /*---------*/ std::cout << "#AXPY" << std::endl; for(auto N : BLAS1_N) { - std::cout << N; /* ATIDLAS */ ad::array x(N, dtype), y(N, dtype); - cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0]; - ad::model & model = ad::get_model(queue, ad::VECTOR_AXPY_TYPE, dtype); - ad::array_expression E = ad::detail::assign(y, x + y); - model.tune(E); - ad::operation_cache cache; - model.execute(E, &cache); - queue.flush(); - queue.finish(); - BENCHMARK(cache.enqueue(); queue.flush(); queue.finish();, bandwidth(3*N, tres, dtsize)); - /* clAmdBlas */ -#ifdef BENCH_CLAMDBLAS - BENCHMARK(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, NULL); queue.flush(); queue.finish();, bandwidth(3*N, tres, dtsize)) -#endif + cl::CommandQueue& queue = ad::cl_ext::queues[x.context()][0]; + cl::Event event; + y = ad::controller(x + y, ad::execution_options_type(0, &event)); + queue.flush(); queue.finish(); + std::cout << " " << bandwidth(3*N, 1e-9*(event.getProfilingInfo() - event.getProfilingInfo()), dtsize) << std::flush; +// /* clAmdBlas */ +//#ifdef BENCH_CLAMDBLAS +// BENCHMARK(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, &event(), NULL); CL_SYNC, CL_TIME, bandwidth(3*N, tres, dtsize)) +//#endif /* BLAS */ #ifdef BENCH_CBLAS std::vector cx(N), cy(N); @@ -177,6 +172,8 @@ int main(int argc, char* argv[]) clAmdBlasSetup(); #endif + ad::cl_ext::queue_properties = CL_QUEUE_PROFILING_ENABLE; + int device_idx = 0; ad::cl_ext::queues_type::data_type const & queues = ad::cl_ext::queues.data(); diff --git a/include/atidlas/array.h b/include/atidlas/array.h index 90ea96525..9adc7b33c 100644 --- a/include/atidlas/array.h +++ b/include/atidlas/array.h @@ -2,6 +2,7 @@ #define ATIDLAS_ARRAY_H_ #include +#include #include #include "atidlas/types.h" #include "atidlas/cl_ext/backend.h" @@ -16,6 +17,8 @@ class scalar; class array: public array_base { friend array reshape(array const &, int_t, int_t); + template + struct is_array { enum{ value = std::is_same::value || std::is_same::value}; }; public: //1D Constructors array(int_t size1, numeric_type dtype, cl::Context context = cl_ext::default_context()); @@ -51,8 +54,10 @@ public: //Numeric operators array& operator=(array const &); array& operator=(array_expression const &); - - template array & operator=(std::vector const & rhs); + template + array& operator=(controller const &); + template + array & operator=(std::vector const & rhs); array_expression operator-(); array_expression operator!(); diff --git a/include/atidlas/backend/templates/base.h b/include/atidlas/backend/templates/base.h index 7bba6cf27..be6098f9b 100644 --- a/include/atidlas/backend/templates/base.h +++ b/include/atidlas/backend/templates/base.h @@ -163,7 +163,7 @@ public: std::vector generate(unsigned int label, expressions_tuple const & expressions, cl::Device const & device); virtual int check_invalid(expressions_tuple const & expressions, cl::Device const & device) const = 0; virtual void enqueue(cl::CommandQueue & queue, std::vector & programs, - unsigned int label, expressions_tuple const & expressions, operation_cache* cache = NULL) = 0; + unsigned int label, controller const & expressions) = 0; virtual std::shared_ptr clone() const = 0; private: binding_policy_t binding_policy_; diff --git a/include/atidlas/backend/templates/maxpy.h b/include/atidlas/backend/templates/maxpy.h index d29506d04..56b98d02c 100644 --- a/include/atidlas/backend/templates/maxpy.h +++ b/include/atidlas/backend/templates/maxpy.h @@ -27,7 +27,7 @@ public: maxpy(parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE); maxpy(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, binding_policy_t bind = BIND_ALL_UNIQUE); std::vector input_sizes(expressions_tuple const & expressions); - void enqueue(cl::CommandQueue & queue, std::vector & programs, unsigned int label, expressions_tuple const & expressions, operation_cache * cache = NULL); + void enqueue(cl::CommandQueue & queue, std::vector & programs, unsigned int label, controller const &); }; } diff --git a/include/atidlas/backend/templates/mproduct.h b/include/atidlas/backend/templates/mproduct.h index ae957338b..7f694668a 100644 --- a/include/atidlas/backend/templates/mproduct.h +++ b/include/atidlas/backend/templates/mproduct.h @@ -41,14 +41,14 @@ private: void enqueue_block(cl::CommandQueue & queue, int_t M, int_t N, int_t K, array_infos const & A, array_infos const & B, array_infos const & C, value_scalar const & alpha, value_scalar const & beta, - std::vector & programs, unsigned int label, int id, operation_cache * cache); + std::vector & programs, unsigned int label, int id, execution_options_type const & options); array_infos create_slice(array_infos & M, int_t s0_0, int_t s0_1, int_t s1_0, int_t s1_1, bool swap); std::vector infos(expressions_tuple const & expressions, lhs_rhs_element & C, lhs_rhs_element & A, lhs_rhs_element & B); public: mproduct(mproduct::parameters_type const & parameters, char A_trans, char B_trans); std::vector input_sizes(expressions_tuple const & expressions); - void enqueue(cl::CommandQueue & queue, std::vector & programs, unsigned int label, expressions_tuple const & expressions, operation_cache * cache = NULL); + void enqueue(cl::CommandQueue & queue, std::vector & programs, unsigned int label, controller const &); private: const char A_trans_; const char B_trans_; diff --git a/include/atidlas/backend/templates/mreduction.h b/include/atidlas/backend/templates/mreduction.h index 61905388c..c98a6ae8f 100644 --- a/include/atidlas/backend/templates/mreduction.h +++ b/include/atidlas/backend/templates/mreduction.h @@ -35,7 +35,7 @@ private: std::vector generate_impl(unsigned int, expressions_tuple const &, std::vector const &) const; public: virtual std::vector input_sizes(expressions_tuple const & expressions); - void enqueue(cl::CommandQueue & queue,std::vector & programs,unsigned int label, expressions_tuple const & expressions, operation_cache * cache = NULL); + void enqueue(cl::CommandQueue & queue,std::vector & programs,unsigned int label, controller const &); private: reduction_type reduction_type_; }; diff --git a/include/atidlas/backend/templates/reduction.h b/include/atidlas/backend/templates/reduction.h index 25437c622..9e96a7bb9 100644 --- a/include/atidlas/backend/templates/reduction.h +++ b/include/atidlas/backend/templates/reduction.h @@ -29,7 +29,7 @@ public: reduction(reduction::parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE); reduction(unsigned int simd, unsigned int ls, unsigned int ng, fetching_policy_type fetch, binding_policy_t bind = BIND_ALL_UNIQUE); std::vector input_sizes(expressions_tuple const & expressions); - void enqueue(cl::CommandQueue & queue, std::vector & programs, unsigned int label, expressions_tuple const & expressions, operation_cache * cache = NULL); + void enqueue(cl::CommandQueue & queue, std::vector & programs, unsigned int label, controller const &); private: std::vector< cl::Buffer > tmp_; std::vector< cl::Buffer > tmpidx_; diff --git a/include/atidlas/backend/templates/vaxpy.h b/include/atidlas/backend/templates/vaxpy.h index 2f85731ce..0b8e68bf3 100644 --- a/include/atidlas/backend/templates/vaxpy.h +++ b/include/atidlas/backend/templates/vaxpy.h @@ -23,7 +23,7 @@ public: vaxpy(vaxpy::parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE); vaxpy(unsigned int _simd_width, unsigned int _group_size, unsigned int _num_groups, fetching_policy_type _fetching_policy, binding_policy_t binding_policy = BIND_ALL_UNIQUE); std::vector input_sizes(expressions_tuple const & expressions); - void enqueue(cl::CommandQueue & queue, std::vector & programs, unsigned int label, expressions_tuple const & expressions, operation_cache * cache = NULL); + void enqueue(cl::CommandQueue & queue, std::vector & programs, unsigned int label, controller const &); }; } diff --git a/include/atidlas/model/model.h b/include/atidlas/model/model.h index 52889fa3f..e3fea979f 100644 --- a/include/atidlas/model/model.h +++ b/include/atidlas/model/model.h @@ -17,29 +17,19 @@ namespace atidlas class model { typedef std::vector< std::shared_ptr > templates_container; - public: - struct runtime_options - { - runtime_options() : label(-1), recompile(false){} - runtime_options(std::string const & p) : program_name(p), label(-1), recompile(false){} - - std::string program_name; - int label; - bool recompile; - }; private: std::string define_extension(std::string const & extensions, std::string const & ext); inline void fill_program_name(char* program_name, expressions_tuple const & expressions, binding_policy_t binding_policy); - std::vector& init(expressions_tuple const & expressions, runtime_options const & opt = runtime_options()); + std::vector& init(controller const &); public: model(predictors::random_forest const &, std::vector< std::shared_ptr > const &, cl::CommandQueue &); model(std::vector< std::shared_ptr > const &, cl::CommandQueue &); model(base const &, cl::CommandQueue &); - void execute(expressions_tuple const &, operation_cache * cache = NULL, runtime_options const & opt = runtime_options()); - void tune(expressions_tuple const &); + void execute(controller const &); + void tune(controller const &); templates_container const & templates() const; private: diff --git a/include/atidlas/symbolic/execute.h b/include/atidlas/symbolic/execute.h index 39cb4ae68..3b397bcde 100644 --- a/include/atidlas/symbolic/execute.h +++ b/include/atidlas/symbolic/execute.h @@ -9,7 +9,7 @@ namespace atidlas { /** @brief Executes a array_expression on the given queue for the given models map*/ -void execute(array_expression &, model_map_t &, operation_cache * cache = NULL); +void execute(controller const & , model_map_t &); } diff --git a/include/atidlas/symbolic/expression.h b/include/atidlas/symbolic/expression.h index 1039dfef2..647fbdec8 100644 --- a/include/atidlas/symbolic/expression.h +++ b/include/atidlas/symbolic/expression.h @@ -208,27 +208,83 @@ private: size4 shape_; }; +class operation_cache +{ + struct infos + { + cl::CommandQueue & queue; + cl::Kernel kernel; + cl::NDRange offset; + cl::NDRange global; + cl::NDRange local; + std::vector* dependencies; + cl::Event* event; + }; + +public: + void push_back(cl::CommandQueue & queue, cl::Kernel const & kernel, cl::NDRange const & offset, cl::NDRange const & global, cl::NDRange const & local, std::vector* dependencies, cl::Event* event) + { l_.push_back({queue, kernel, offset, global, local, dependencies, event}); } + + void enqueue() + { + for(infos & i : l_) + i.queue.enqueueNDRangeKernel(i.kernel, i.offset, i.global, i.local, i.dependencies, i.event); + } + +private: + std::list l_; +}; + +struct execution_options_type +{ + execution_options_type(unsigned int _queue_id = 0, cl::Event* _event = NULL, operation_cache* _cache = NULL, std::vector* _dependencies = NULL) : queue_id(_queue_id), event(_event), cache(_cache), dependencies(_dependencies){} + + void enqueue_cache(cl::CommandQueue & queue, cl::Kernel const & kernel, cl::NDRange offset, cl::NDRange global, cl::NDRange local) const + { + queue.enqueueNDRangeKernel(kernel, offset, global, local, dependencies, event); + if(cache) + cache->push_back(queue, kernel, cl::NullRange, global, local, dependencies, event); + } + + unsigned int queue_id; + cl::Event* event; + operation_cache* cache; + std::vector* dependencies; +}; + +struct dispatcher_options_type +{ + dispatcher_options_type(int _label = -1) : label(_label){} + int label; +}; + +struct compilation_options_type +{ + compilation_options_type(std::string const & _program_name = "", bool _recompile = false) : program_name(_program_name), recompile(_recompile){} + std::string program_name; + bool recompile; +}; + template class controller { public: - controller(TYPE const & x, cl::Event* event = NULL, std::vector* dependencies = NULL, - cl::CommandQueue* queue = NULL, operation_cache* cache = NULL) : x_(x), event_(event), dependencies_(dependencies), queue_(queue), cache_(cache){} + controller(TYPE const & x, execution_options_type const& execution_options = execution_options_type(), + dispatcher_options_type const & dispatcher_options = dispatcher_options_type(), compilation_options_type const & compilation_options = compilation_options_type()) + : x_(x), execution_options_(execution_options), dispatcher_options_(dispatcher_options), compilation_options_(compilation_options){} TYPE const & x() const { return x_; } - cl::Event* event() const { return event_; } - std::vector* dependencies() const { return dependencies_; } - cl::CommandQueue* queue() const { return queue_; } - operation_cache* cache() const { return cache_; } - + execution_options_type const & execution_options() const { return execution_options_; } + dispatcher_options_type const & dispatcher_options() const { return dispatcher_options_; } + compilation_options_type const & compilation_options() const { return compilation_options_; } private: TYPE const & x_; - cl::Event* event_; - std::vector* dependencies_; - cl::CommandQueue* queue_; - operation_cache* cache_; + execution_options_type execution_options_; + dispatcher_options_type dispatcher_options_; + compilation_options_type compilation_options_; }; + class expressions_tuple { private: diff --git a/include/atidlas/types.h b/include/atidlas/types.h index 289a68b6d..78ecbbe8b 100644 --- a/include/atidlas/types.h +++ b/include/atidlas/types.h @@ -61,36 +61,6 @@ struct array_infos int_t ld; }; -class operation_cache -{ - struct infos - { - infos(cl::CommandQueue & q, cl::Kernel const & k, cl::NDRange const & off, cl::NDRange const & g, cl::NDRange const & l) - : queue(q), kernel(k), offset(off), grange(g), lrange(l) {} - - cl::CommandQueue & queue; - cl::Kernel kernel; - cl::NDRange offset; - cl::NDRange grange; - cl::NDRange lrange; - }; - -public: - void push_back(cl::CommandQueue & queue, cl::Kernel const & kernel, cl::NDRange const & offset, cl::NDRange const & grange, cl::NDRange const & lrange) - { - l_.push_back(infos(queue, kernel, offset, grange, lrange)); - } - - void enqueue() - { - for(infos & elem : l_) - elem.queue.enqueueNDRangeKernel(elem.kernel, elem.offset, elem.grange, elem.lrange); - } - -private: - std::list l_; -}; - inline std::string numeric_type_to_string(numeric_type const & type) { switch (type) diff --git a/lib/array.cpp b/lib/array.cpp index 4fc6a4c09..f888c0dac 100644 --- a/lib/array.cpp +++ b/lib/array.cpp @@ -131,22 +131,17 @@ int_t array::dsize() const /*--- Assignment Operators ----*/ //--------------------------------------- array & array::operator=(array const & rhs) -{ - assert(dtype_ == rhs.dtype()); - array_expression expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ASSIGN_TYPE), context_, dtype_, shape_); - cl::CommandQueue & queue = cl_ext::queues[context_][0]; - model_map_t & mmap = atidlas::get_model_map(queue); - execute(expression, mmap); - return *this; -} +{ return *this = controller(rhs); } array & array::operator=(array_expression const & rhs) +{ return *this = controller(rhs); } + +template +array& array::operator=(controller const & c) { - assert(dtype_ == rhs.dtype()); - array_expression expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ASSIGN_TYPE), dtype_, shape_); - cl::CommandQueue & queue = cl_ext::queues[context_][0]; - model_map_t & mmap = atidlas::get_model_map(queue); - execute(expression, mmap); + assert(dtype_ == c.x().dtype()); + execute(controller(detail::assign(*this, c.x()), c.execution_options(), c.dispatcher_options(), c.compilation_options()), + atidlas::get_model_map(cl_ext::queues[context_][c.execution_options().queue_id])); return *this; } diff --git a/lib/backend/templates/maxpy.cpp b/lib/backend/templates/maxpy.cpp index 968b12205..1bf59686d 100644 --- a/lib/backend/templates/maxpy.cpp +++ b/lib/backend/templates/maxpy.cpp @@ -105,23 +105,22 @@ std::vector maxpy::input_sizes(expressions_tuple const & expressions) } void maxpy::enqueue(cl::CommandQueue & queue, std::vector & programs, - unsigned int label, expressions_tuple const & expressions, operation_cache * cache) + unsigned int label, controller const & controller) { + expressions_tuple const & expressions = controller.x(); char kname[10]; fill_kernel_name(kname, label, "d"); cl::Program & program = programs[0].program(); cl::Kernel kernel(program, kname); - cl::NDRange grange(p_.local_size_0*p_.num_groups_0, p_.local_size_1*p_.num_groups_1); - cl::NDRange lrange(p_.local_size_0, p_.local_size_1); + cl::NDRange global(p_.local_size_0*p_.num_groups_0, p_.local_size_1*p_.num_groups_1); + cl::NDRange local(p_.local_size_0, p_.local_size_1); unsigned int current_arg = 0; std::vector MN = input_sizes(expressions); kernel.setArg(current_arg++, cl_uint(MN[0])); kernel.setArg(current_arg++, cl_uint(MN[1])); set_arguments(expressions, kernel, current_arg); - queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange); - if(cache) - cache->push_back(queue, kernel, cl::NullRange, grange, lrange); + controller.execution_options().enqueue_cache(queue, kernel, cl::NullRange, global, local); } template class base_impl; diff --git a/lib/backend/templates/mproduct.cpp b/lib/backend/templates/mproduct.cpp index 625fce762..baff5c2a9 100644 --- a/lib/backend/templates/mproduct.cpp +++ b/lib/backend/templates/mproduct.cpp @@ -568,7 +568,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width void mproduct::enqueue_block(cl::CommandQueue & queue, int_t M, int_t N, int_t K, array_infos const & A, array_infos const & B, array_infos const & C, value_scalar const & alpha, value_scalar const & beta, - std::vector & programs, unsigned int label, int id, operation_cache * cache) + std::vector & programs, unsigned int label, int id, execution_options_type const & options) { if (A.shape1==0 || A.shape2==0 || B.shape1==0 || B.shape2==0 || C.shape1==0 || C.shape2==0) return; @@ -578,8 +578,8 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width cl::Program & program = programs[id].program(); cl::Kernel kernel(program, kname); - cl::NDRange lrange(p_.local_size_0, p_.local_size_1); - cl::NDRange grange = (id==1)?cl::NDRange(align(align(M,p_.mS)/p_.mS, p_.local_size_0), align(align(N,p_.nS)/p_.nS, p_.local_size_1)): + cl::NDRange local(p_.local_size_0, p_.local_size_1); + cl::NDRange global = (id==1)?cl::NDRange(align(align(M,p_.mS)/p_.mS, p_.local_size_0), align(align(N,p_.nS)/p_.nS, p_.local_size_1)): cl::NDRange(M/p_.mS, N/p_.nS); unsigned int current_arg = 0; @@ -595,10 +595,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width fun.set_arguments(B); fun.set_arguments(beta.dtype(), beta.values()); - queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange); - - if(cache) - cache->push_back(queue, kernel, cl::NullRange, grange, lrange); + options.enqueue_cache(queue,kernel, cl::NullRange, global, local); } array_infos mproduct::create_slice(array_infos & M, int_t s0_0, int_t s0_1, int_t s1_0, int_t s1_1, bool swap) @@ -649,13 +646,17 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width return infos(expressions, d0, d1, d2); } - void mproduct::enqueue(cl::CommandQueue & queue, std::vector & programs, unsigned int label, expressions_tuple const & expressions, operation_cache * cache) + void mproduct::enqueue(cl::CommandQueue & queue, std::vector & programs, unsigned int label, controller const & controller) { using namespace tools; + expressions_tuple const & expressions = controller.x(); + lhs_rhs_element C, A, B; std::vector MNK = infos(expressions, C, A, B); + execution_options_type const & options = controller.execution_options(); + int_t M = MNK[0]; int_t N = MNK[1]; int_t K = MNK[2]; @@ -687,7 +688,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width { enqueue_block(queue, M, N, K, create_slice(pA, 0, M, 0, K, swap_A), create_slice(pB, 0, K, 0, N, swap_B), - create_slice(pC, 0, M, 0, N, false), *_1, *_0, programs, label, 1, cache); + create_slice(pC, 0, M, 0, N, false), *_1, *_0, programs, label, 1, options); return; } @@ -695,17 +696,17 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width int_t lN = N / p_.nL * p_.nL; int_t lK = K / p_.kL * p_.kL; - enqueue_block(queue, lM, lN, lK, create_slice(pA, 0, lM, 0, lK, swap_A), create_slice(pB, 0, lK, 0, lN, swap_B), create_slice(pC, 0, lM, 0, lN, false), *_1, *_0, programs, label, 0, cache); - enqueue_block(queue, lM, lN, K - lK, create_slice(pA, 0, lM, lK, K, swap_A), create_slice(pB, lK, K, 0, lN, swap_B), create_slice(pC, 0, lM, 0, lN, false), *_1, *_1, programs, label, 1, cache); + enqueue_block(queue, lM, lN, lK, create_slice(pA, 0, lM, 0, lK, swap_A), create_slice(pB, 0, lK, 0, lN, swap_B), create_slice(pC, 0, lM, 0, lN, false), *_1, *_0, programs, label, 0, options); + enqueue_block(queue, lM, lN, K - lK, create_slice(pA, 0, lM, lK, K, swap_A), create_slice(pB, lK, K, 0, lN, swap_B), create_slice(pC, 0, lM, 0, lN, false), *_1, *_1, programs, label, 1, options); - enqueue_block(queue, lM, N - lN, lK, create_slice(pA, 0, lM, 0, lK, swap_A), create_slice(pB, 0, lK, lN, N, swap_B), create_slice(pC, 0, lM, lN, N, false), *_1, *_0, programs, label, 1, cache); - enqueue_block(queue, lM, N - lN, K - lK, create_slice(pA, 0, lM, lK, K, swap_A), create_slice(pB, lK, K, lN, N, swap_B), create_slice(pC, 0, lM, lN, N, false), *_1, *_1, programs, label, 1, cache); + enqueue_block(queue, lM, N - lN, lK, create_slice(pA, 0, lM, 0, lK, swap_A), create_slice(pB, 0, lK, lN, N, swap_B), create_slice(pC, 0, lM, lN, N, false), *_1, *_0, programs, label, 1, options); + enqueue_block(queue, lM, N - lN, K - lK, create_slice(pA, 0, lM, lK, K, swap_A), create_slice(pB, lK, K, lN, N, swap_B), create_slice(pC, 0, lM, lN, N, false), *_1, *_1, programs, label, 1, options); - enqueue_block(queue, M - lM, lN, lK, create_slice(pA, lM, M, 0, lK, swap_A), create_slice(pB, 0, lK, 0, lN, swap_B), create_slice(pC, lM, M, 0, lN, false), *_1, *_0, programs, label, 1, cache); - enqueue_block(queue, M - lM, lN, K - lK, create_slice(pA, lM, M, lK, K, swap_A), create_slice(pB, lK, K, 0, lN, swap_B), create_slice(pC, lM, M, 0, lN, false), *_1, *_1, programs, label, 1, cache); + enqueue_block(queue, M - lM, lN, lK, create_slice(pA, lM, M, 0, lK, swap_A), create_slice(pB, 0, lK, 0, lN, swap_B), create_slice(pC, lM, M, 0, lN, false), *_1, *_0, programs, label, 1, options); + enqueue_block(queue, M - lM, lN, K - lK, create_slice(pA, lM, M, lK, K, swap_A), create_slice(pB, lK, K, 0, lN, swap_B), create_slice(pC, lM, M, 0, lN, false), *_1, *_1, programs, label, 1, options); - enqueue_block(queue, M - lM, N - lN, lK, create_slice(pA, lM, M, 0, lK, swap_A), create_slice(pB, 0, lK, lN, N, swap_B), create_slice(pC, lM, M, lN, N, false), *_1, *_0, programs, label, 1, cache); - enqueue_block(queue, M - lM, N - lN, K - lK, create_slice(pA, lM, M, lK, K, swap_A), create_slice(pB, lK, K, lN, N, swap_B), create_slice(pC, lM, M, lN, N, false), *_1, *_1, programs, label, 1, cache); + enqueue_block(queue, M - lM, N - lN, lK, create_slice(pA, lM, M, 0, lK, swap_A), create_slice(pB, 0, lK, lN, N, swap_B), create_slice(pC, lM, M, lN, N, false), *_1, *_0, programs, label, 1, options); + enqueue_block(queue, M - lM, N - lN, K - lK, create_slice(pA, lM, M, lK, K, swap_A), create_slice(pB, lK, K, lN, N, swap_B), create_slice(pC, lM, M, lN, N, false), *_1, *_1, programs, label, 1, options); } // diff --git a/lib/backend/templates/mreduction.cpp b/lib/backend/templates/mreduction.cpp index 73fd6766a..b6c5e7d59 100644 --- a/lib/backend/templates/mreduction.cpp +++ b/lib/backend/templates/mreduction.cpp @@ -214,9 +214,10 @@ std::vector mreduction::input_sizes(expressions_tuple const & expressions return tools::make_vector() << MN.first << MN.second; } -void mreduction::enqueue(cl::CommandQueue & queue, std::vector & programs, - unsigned int label, expressions_tuple const & expressions, operation_cache * cache) +void mreduction::enqueue(cl::CommandQueue & queue, std::vector & programs, unsigned int label, controller const & controller) { + expressions_tuple const & expressions = controller.x(); + char kname[10]; fill_kernel_name(kname, label, "d"); std::vector MN = input_sizes(expressions); @@ -229,18 +230,15 @@ void mreduction::enqueue(cl::CommandQueue & queue, std::vectorpush_back(queue, kernel, cl::NullRange, grange, lrange); + controller.execution_options().enqueue_cache(queue, kernel, cl::NullRange, global, local); } mreduction_rows::mreduction_rows(mreduction_parameters const & parameters, diff --git a/lib/backend/templates/reduction.cpp b/lib/backend/templates/reduction.cpp index d46d86687..7ecaa729a 100644 --- a/lib/backend/templates/reduction.cpp +++ b/lib/backend/templates/reduction.cpp @@ -280,9 +280,10 @@ std::vector reduction::input_sizes(expressions_tuple const & expressions) return tools::make_vector() << N; } -void reduction::enqueue(cl::CommandQueue & queue, std::vector & programs, - unsigned int label, expressions_tuple const & expressions, operation_cache * cache) +void reduction::enqueue(cl::CommandQueue & queue, std::vector & programs, unsigned int label, controller const & controller) { + expressions_tuple const & expressions = controller.x(); + //Preprocessing int_t size = input_sizes(expressions)[0]; std::vector reductions; @@ -307,8 +308,8 @@ void reduction::enqueue(cl::CommandQueue & queue, std::vectorpush_back(queue, kernels[k], cl::NullRange, grange[k], lrange[k]); + controller.execution_options().enqueue_cache(queue, kernels[k], cl::NullRange, global[k], local[k]); } template class base_impl; diff --git a/lib/backend/templates/vaxpy.cpp b/lib/backend/templates/vaxpy.cpp index 2f70e17de..f2014c22e 100644 --- a/lib/backend/templates/vaxpy.cpp +++ b/lib/backend/templates/vaxpy.cpp @@ -108,9 +108,9 @@ std::vector vaxpy::input_sizes(expressions_tuple const & expressions) return tools::make_vector() << size; } -void vaxpy::enqueue(cl::CommandQueue & queue, std::vector & programs, - unsigned int label, expressions_tuple const & expressions, operation_cache * cache) +void vaxpy::enqueue(cl::CommandQueue & queue, std::vector & programs, unsigned int label, controller const & controller) { + expressions_tuple const & expressions = controller.x(); //Size int_t size = input_sizes(expressions)[0]; //Kernel @@ -128,16 +128,14 @@ void vaxpy::enqueue(cl::CommandQueue & queue, std::vector cl::Kernel & kernel = it->second; //NDRange - cl::NDRange grange(p_.local_size_0*p_.num_groups); - cl::NDRange lrange(p_.local_size_0); + cl::NDRange global(p_.local_size_0*p_.num_groups); + cl::NDRange local(p_.local_size_0); //Arguments unsigned int current_arg = 0; kernel.setArg(current_arg++, cl_uint(size)); set_arguments(expressions, kernel, current_arg); - queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange); - if(cache) - cache->push_back(queue, kernel, cl::NullRange, grange, lrange); + controller.execution_options().enqueue_cache(queue, kernel, cl::NullRange, global, local); } diff --git a/lib/model/model.cpp b/lib/model/model.cpp index 6e62a4083..1d139c3f4 100644 --- a/lib/model/model.cpp +++ b/lib/model/model.cpp @@ -46,18 +46,19 @@ void model::fill_program_name(char* program_name, expressions_tuple const & expr delete binder; } -std::vector& model::init(expressions_tuple const & expressions, runtime_options const & opt) +std::vector& model::init(controller const & expressions) { - cl::Context const & context = expressions.context(); + cl::Context const & context = expressions.x().context(); std::string pname; + compilation_options_type const & opt = expressions.compilation_options(); if(opt.program_name.empty()) { char program_name[256]; - fill_program_name(program_name, expressions, BIND_TO_HANDLE); + fill_program_name(program_name, expressions.x(), BIND_TO_HANDLE); pname = std::string(program_name); } else - pname = opt.program_name; + pname = expressions.compilation_options().program_name; std::vector & to_init = lazy_programs_[context()][pname]; if(to_init.empty()) { @@ -72,7 +73,7 @@ std::vector& model::init(expressions_tuple const & expres for(size_t i = 0 ; i < templates_.size() ; ++i) { - std::vector cur = templates_[i]->generate(i, expressions, device); + std::vector cur = templates_[i]->generate(i, expressions.x(), device); for(size_t j = 0 ; j < cur.size() ; ++j){ to_init[j].add(cur[j]); } @@ -91,19 +92,19 @@ model::model(std::vector< std::shared_ptr > const & templates, cl::Command model::model(base const & tp, cl::CommandQueue & queue) : templates_(1,tp.clone()), queue_(queue) {} -void model::execute(expressions_tuple const & expressions, operation_cache *cache, runtime_options const & opt) +void model::execute(controller const & expressions) { - std::vector & compilers = init(expressions, opt); + std::vector & compilers = init(expressions); //Prediction int label = 0; - if(opt.label>=0) + if(expressions.dispatcher_options().label>=0) { - label = opt.label; + label = expressions.dispatcher_options().label; } else { - std::vector x = templates_[0]->input_sizes(expressions); + std::vector x = templates_[0]->input_sizes(expressions.x()); //The user tuned the model specifically for this input size if(hardcoded_.find(x)!=hardcoded_.end()) label = hardcoded_.at(x); @@ -116,10 +117,10 @@ void model::execute(expressions_tuple const & expressions, operation_cache *cach } //Execution - return templates_[label]->enqueue(queue_, compilers, label, expressions, cache); + return templates_[label]->enqueue(queue_, compilers, label, expressions); } -void model::tune(expressions_tuple const & expressions) +void model::tune(controller const & expressions) { std::vector & compilers = init(expressions); @@ -135,7 +136,7 @@ void model::tune(expressions_tuple const & expressions) } //Fill the override - std::vector x = templates_[0]->input_sizes(expressions); + std::vector x = templates_[0]->input_sizes(expressions.x()); hardcoded_[x] = std::distance(timings.begin(),std::min_element(timings.begin(), timings.end())); } diff --git a/lib/symbolic/execute.cpp b/lib/symbolic/execute.cpp index 5da0566ff..02622e9d3 100644 --- a/lib/symbolic/execute.cpp +++ b/lib/symbolic/execute.cpp @@ -147,11 +147,12 @@ namespace atidlas } /** @brief Executes a array_expression on the given models map*/ - void execute(atidlas::array_expression & array_expression, model_map_t & models, operation_cache * cache) + void execute(controller const & c, model_map_t & models) { - cl::Context const & context = array_expression.context(); - size_t rootidx = array_expression.root(); - array_expression::container_type & tree = const_cast(array_expression.tree()); + array_expression expression = c.x(); + cl::Context const & context = expression.context(); + size_t rootidx = expression.root(); + array_expression::container_type & tree = const_cast(expression.tree()); array_expression::node root_save = tree[rootidx]; //Todo: technically the datatype should be per temporary @@ -207,7 +208,7 @@ namespace atidlas tree[rootidx].rhs.type_family = rit->second->type_family; //Execute - pmodel->execute(array_expression); + pmodel->execute(controller(expression, c.execution_options(), c.dispatcher_options(), c.compilation_options())); tree[rootidx] = root_save; //Incorporates the temporary within the array_expression @@ -215,7 +216,7 @@ namespace atidlas } /*-----Compute final expression-----*/ - models[std::make_pair(final_type, dtype)]->execute(array_expression, cache); + models[std::make_pair(final_type, dtype)]->execute(controller(expression, c.execution_options(), c.dispatcher_options(), c.compilation_options())); } } diff --git a/python/pyatidlas/src/_atidlas.cpp b/python/pyatidlas/src/_atidlas.cpp index 2050000cb..c98e51bb2 100644 --- a/python/pyatidlas/src/_atidlas.cpp +++ b/python/pyatidlas/src/_atidlas.cpp @@ -209,7 +209,8 @@ namespace detail else if(name=="uint64") return atd::ULONG_TYPE; else if(name=="float32") return atd::FLOAT_TYPE; else if(name=="float64") return atd::DOUBLE_TYPE; - else{ + else + { PyErr_SetString(PyExc_TypeError, "Data type not understood"); bp::throw_error_already_set(); throw; @@ -233,7 +234,8 @@ namespace detail else if(name=="mproduct_tn") return atd::MATRIX_PRODUCT_TN_TYPE; else if(name=="mproduct_nt") return atd::MATRIX_PRODUCT_NT_TYPE; else if(name=="mproduct_tt") return atd::MATRIX_PRODUCT_TT_TYPE; - else{ + else + { PyErr_SetString(PyExc_TypeError, "Template type not understood"); bp::throw_error_already_set(); throw;