Implemented simple operation cache

This commit is contained in:
Philippe Tillet
2015-02-01 23:56:05 -05:00
parent 535706f35a
commit 2afc574724
17 changed files with 90 additions and 57 deletions

View File

@@ -1,5 +1,5 @@
#include "atidlas/array.h" #include "atidlas/array.h"
#include "atidlas/model/model.h" #include "atidlas/symbolic/execute.h"
#include "atidlas/tools/timer.hpp" #include "atidlas/tools/timer.hpp"
#include "common.hpp" #include "common.hpp"
#ifdef BENCH_CLAMDBLAS #ifdef BENCH_CLAMDBLAS
@@ -32,7 +32,7 @@ void bench(ad::numeric_type dtype)
times.clear();\ times.clear();\
total_time = 0;\ total_time = 0;\
OP;\ OP;\
ad::cl_ext::synchronize(ad::cl_ext::default_context());\ SYNC;\
while(total_time < 5e-1){\ while(total_time < 5e-1){\
timer.start(); \ timer.start(); \
OP;\ OP;\
@@ -44,7 +44,7 @@ void bench(ad::numeric_type dtype)
std::cout << " " << PERF << std::flush;\ std::cout << " " << PERF << std::flush;\
} }
#define CL_BENCHMARK(OP, PERF) BENCHMARK(OP, PERF, ad::cl_ext::synchronize(ad::cl_ext::default_context())) #define CL_BENCHMARK(OP, PERF) BENCHMARK(OP, PERF, queue.flush(); queue.finish();)
#define CPU_SYNCHRONIZE #define CPU_SYNCHRONIZE
#define CPU_BENCHMARK(OP, PERF) BENCHMARK(OP, PERF, CPU_SYNCHRONIZE) #define CPU_BENCHMARK(OP, PERF) BENCHMARK(OP, PERF, CPU_SYNCHRONIZE)
@@ -62,11 +62,15 @@ void bench(ad::numeric_type dtype)
std::cout << N; std::cout << N;
/* ATIDLAS */ /* ATIDLAS */
ad::array x(N, dtype), y(N, dtype); ad::array x(N, dtype), y(N, dtype);
cl::CommandQueue & queue = ad::cl_ext::get_queue(x.context(), 0);
ad::model & model = ad::get_model(queue, ad::VECTOR_AXPY_TYPE, dtype);
ad::array_expression E = ad::detail::assign(y, x + y); ad::array_expression E = ad::detail::assign(y, x + y);
ad::model & model = ad::get_model(ad::cl_ext::get_queue(x.context(), 0), ad::VECTOR_AXPY_TYPE, dtype);
ad::model::runtime_options opt("saxpy");
model.tune(E); model.tune(E);
CL_BENCHMARK(model.execute(E, opt), bandwidth(3*N, tres, dtsize)); ad::operation_cache cache;
model.execute(E, &cache);
queue.flush();
queue.finish();
CL_BENCHMARK(cache.enqueue(), bandwidth(3*N, tres, dtsize));
/* clAmdBlas */ /* clAmdBlas */
#ifdef BENCH_CLAMDBLAS #ifdef BENCH_CLAMDBLAS
CL_BENCHMARK(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &ad::cl_ext::get_queue(x.context(), 0)(), 0, NULL, NULL), bandwidth(3*N, tres, dtsize)) CL_BENCHMARK(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &ad::cl_ext::get_queue(x.context(), 0)(), 0, NULL, NULL), bandwidth(3*N, tres, dtsize))

View File

@@ -162,9 +162,8 @@ public:
virtual ~base(); virtual ~base();
std::vector<std::string> generate(unsigned int label, expressions_tuple const & expressions, cl::Device const & device); std::vector<std::string> generate(unsigned int label, expressions_tuple const & expressions, cl::Device const & device);
virtual int check_invalid(expressions_tuple const & expressions, cl::Device const & device) const = 0; virtual int check_invalid(expressions_tuple const & expressions, cl::Device const & device) const = 0;
virtual void enqueue(cl::CommandQueue & queue, virtual void enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs,
std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, expressions_tuple const & expressions, operation_cache* cache = NULL) = 0;
unsigned int label, expressions_tuple const & expressions) = 0;
virtual tools::shared_ptr<base> clone() const = 0; virtual tools::shared_ptr<base> clone() const = 0;
private: private:
binding_policy_t binding_policy_; binding_policy_t binding_policy_;

View File

@@ -27,7 +27,7 @@ public:
maxpy(parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE); maxpy(parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
maxpy(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, binding_policy_t bind = BIND_ALL_UNIQUE); maxpy(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, binding_policy_t bind = BIND_ALL_UNIQUE);
std::vector<int_t> input_sizes(expressions_tuple const & expressions); std::vector<int_t> input_sizes(expressions_tuple const & expressions);
void enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, expressions_tuple const & expressions); void enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, expressions_tuple const & expressions, operation_cache * cache = NULL);
}; };
} }

View File

@@ -41,18 +41,14 @@ private:
void enqueue_block(cl::CommandQueue & queue, int_t M, int_t N, int_t K, void enqueue_block(cl::CommandQueue & queue, int_t M, int_t N, int_t K,
array_infos const & A, array_infos const & B, array_infos const & C, array_infos const & A, array_infos const & B, array_infos const & C,
value_scalar const & alpha, value_scalar const & beta, value_scalar const & alpha, value_scalar const & beta,
std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, int id); std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, int id, operation_cache * cache);
array_infos create_slice(array_infos & M, int_t s0_0, int_t s0_1, int_t s1_0, int_t s1_1, bool swap); array_infos create_slice(array_infos & M, int_t s0_0, int_t s0_1, int_t s1_0, int_t s1_1, bool swap);
std::vector<int_t> infos(expressions_tuple const & expressions, std::vector<int_t> infos(expressions_tuple const & expressions,
lhs_rhs_element & C, lhs_rhs_element & A, lhs_rhs_element & B); lhs_rhs_element & C, lhs_rhs_element & A, lhs_rhs_element & B);
public: public:
mproduct(mproduct::parameters_type const & parameters, char A_trans, char B_trans); mproduct(mproduct::parameters_type const & parameters, char A_trans, char B_trans);
std::vector<int_t> input_sizes(expressions_tuple const & expressions); std::vector<int_t> input_sizes(expressions_tuple const & expressions);
void enqueue(cl::CommandQueue & queue, void enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, expressions_tuple const & expressions, operation_cache * cache = NULL);
std::vector<cl_ext::lazy_compiler> & programs,
unsigned int label,
expressions_tuple const & expressions);
private: private:
const char A_trans_; const char A_trans_;
const char B_trans_; const char B_trans_;

View File

@@ -35,7 +35,7 @@ private:
std::vector<std::string> generate_impl(unsigned int, expressions_tuple const &, std::vector<mapping_type> const &) const; std::vector<std::string> generate_impl(unsigned int, expressions_tuple const &, std::vector<mapping_type> const &) const;
public: public:
virtual std::vector<int_t> input_sizes(expressions_tuple const & expressions); virtual std::vector<int_t> input_sizes(expressions_tuple const & expressions);
void enqueue(cl::CommandQueue & queue,std::vector<cl_ext::lazy_compiler> & programs,unsigned int label, expressions_tuple const & expressions); void enqueue(cl::CommandQueue & queue,std::vector<cl_ext::lazy_compiler> & programs,unsigned int label, expressions_tuple const & expressions, operation_cache * cache = NULL);
private: private:
reduction_type reduction_type_; reduction_type reduction_type_;
}; };

View File

@@ -29,10 +29,7 @@ public:
reduction(reduction::parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE); reduction(reduction::parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
reduction(unsigned int simd, unsigned int ls, unsigned int ng, fetching_policy_type fetch, binding_policy_t bind = BIND_ALL_UNIQUE); reduction(unsigned int simd, unsigned int ls, unsigned int ng, fetching_policy_type fetch, binding_policy_t bind = BIND_ALL_UNIQUE);
std::vector<int_t> input_sizes(expressions_tuple const & expressions); std::vector<int_t> input_sizes(expressions_tuple const & expressions);
void enqueue(cl::CommandQueue & queue, void enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, expressions_tuple const & expressions, operation_cache * cache = NULL);
std::vector<cl_ext::lazy_compiler> & programs,
unsigned int label,
expressions_tuple const & expressions);
private: private:
std::vector< cl::Buffer > tmp_; std::vector< cl::Buffer > tmp_;
std::vector< cl::Buffer > tmpidx_; std::vector< cl::Buffer > tmpidx_;

View File

@@ -23,8 +23,7 @@ public:
vaxpy(vaxpy::parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE); vaxpy(vaxpy::parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
vaxpy(unsigned int _simd_width, unsigned int _group_size, unsigned int _num_groups, fetching_policy_type _fetching_policy, binding_policy_t binding_policy = BIND_ALL_UNIQUE); vaxpy(unsigned int _simd_width, unsigned int _group_size, unsigned int _num_groups, fetching_policy_type _fetching_policy, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
std::vector<int_t> input_sizes(expressions_tuple const & expressions); std::vector<int_t> input_sizes(expressions_tuple const & expressions);
void enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs, void enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, expressions_tuple const & expressions, operation_cache * cache = NULL);
unsigned int label, expressions_tuple const & expressions);
}; };
} }

View File

@@ -38,7 +38,7 @@ namespace atidlas
model(std::vector< tools::shared_ptr<base> > const &, cl::CommandQueue &); model(std::vector< tools::shared_ptr<base> > const &, cl::CommandQueue &);
model(base const &, cl::CommandQueue &); model(base const &, cl::CommandQueue &);
void execute(expressions_tuple const &, runtime_options const & opt = runtime_options()); void execute(expressions_tuple const &, operation_cache * cache = NULL, runtime_options const & opt = runtime_options());
void tune(expressions_tuple const &); void tune(expressions_tuple const &);
templates_container const & templates() const; templates_container const & templates() const;

View File

@@ -9,7 +9,7 @@ namespace atidlas
{ {
/** @brief Executes a array_expression on the given queue for the given models map*/ /** @brief Executes a array_expression on the given queue for the given models map*/
void execute(array_expression &, model_map_t &); void execute(array_expression &, model_map_t &, operation_cache * cache = NULL);
} }

View File

@@ -2,6 +2,7 @@
#define ATIDLAS_TYPES_H #define ATIDLAS_TYPES_H
#include <CL/cl.hpp> #include <CL/cl.hpp>
#include <list>
#include "atidlas/exception/unknown_datatype.h" #include "atidlas/exception/unknown_datatype.h"
namespace atidlas namespace atidlas
@@ -60,6 +61,36 @@ struct array_infos
int_t ld; int_t ld;
}; };
class operation_cache
{
struct infos
{
infos(cl::CommandQueue & q, cl::Kernel const & k, cl::NDRange const & off, cl::NDRange const & g, cl::NDRange const & l)
: queue(q), kernel(k), offset(off), grange(g), lrange(l) {}
cl::CommandQueue & queue;
cl::Kernel kernel;
cl::NDRange offset;
cl::NDRange grange;
cl::NDRange lrange;
};
public:
void push_back(cl::CommandQueue & queue, cl::Kernel const & kernel, cl::NDRange const & offset, cl::NDRange const & grange, cl::NDRange const & lrange)
{
l_.push_back(infos(queue, kernel, offset, grange, lrange));
}
void enqueue()
{
for(std::list<infos>::iterator it = l_.begin() ; it != l_.end() ; ++it)
it->queue.enqueueNDRangeKernel(it->kernel, it->offset, it->grange, it->lrange);
}
private:
std::list<infos> l_;
};
inline std::string numeric_type_to_string(numeric_type const & type) inline std::string numeric_type_to_string(numeric_type const & type)
{ {
switch (type) switch (type)

View File

@@ -104,10 +104,8 @@ std::vector<int_t> maxpy::input_sizes(expressions_tuple const & expressions)
return tools::make_vector<int_t>() << size.first << size.second; return tools::make_vector<int_t>() << size.first << size.second;
} }
void maxpy::enqueue(cl::CommandQueue & queue, void maxpy::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs,
std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, expressions_tuple const & expressions, operation_cache * cache)
unsigned int label,
expressions_tuple const & expressions)
{ {
char kname[10]; char kname[10];
fill_kernel_name(kname, label, "d"); fill_kernel_name(kname, label, "d");
@@ -121,6 +119,9 @@ void maxpy::enqueue(cl::CommandQueue & queue,
kernel.setArg(current_arg++, cl_uint(MN[1])); kernel.setArg(current_arg++, cl_uint(MN[1]));
set_arguments(expressions, kernel, current_arg); set_arguments(expressions, kernel, current_arg);
queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange); queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange);
if(cache)
cache->push_back(queue, kernel, cl::NullRange, grange, lrange);
} }
template class base_impl<maxpy, maxpy_parameters>; template class base_impl<maxpy, maxpy_parameters>;

View File

@@ -568,7 +568,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
void mproduct::enqueue_block(cl::CommandQueue & queue, int_t M, int_t N, int_t K, void mproduct::enqueue_block(cl::CommandQueue & queue, int_t M, int_t N, int_t K,
array_infos const & A, array_infos const & B, array_infos const & C, array_infos const & A, array_infos const & B, array_infos const & C,
value_scalar const & alpha, value_scalar const & beta, value_scalar const & alpha, value_scalar const & beta,
std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, int id) std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, int id, operation_cache * cache)
{ {
if (A.shape1==0 || A.shape2==0 || B.shape1==0 || B.shape2==0 || C.shape1==0 || C.shape2==0) if (A.shape1==0 || A.shape2==0 || B.shape1==0 || B.shape2==0 || C.shape1==0 || C.shape2==0)
return; return;
@@ -596,6 +596,9 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
fun.set_arguments(beta.dtype(), beta.values()); fun.set_arguments(beta.dtype(), beta.values());
queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange); queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange);
if(cache)
cache->push_back(queue, kernel, cl::NullRange, grange, lrange);
} }
array_infos mproduct::create_slice(array_infos & M, int_t s0_0, int_t s0_1, int_t s1_0, int_t s1_1, bool swap) array_infos mproduct::create_slice(array_infos & M, int_t s0_0, int_t s0_1, int_t s1_0, int_t s1_1, bool swap)
@@ -646,7 +649,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
return infos(expressions, d0, d1, d2); return infos(expressions, d0, d1, d2);
} }
void mproduct::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, expressions_tuple const & expressions) void mproduct::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, expressions_tuple const & expressions, operation_cache * cache)
{ {
using namespace tools; using namespace tools;
@@ -684,7 +687,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
{ {
enqueue_block(queue, M, N, K, create_slice(pA, 0, M, 0, K, swap_A), enqueue_block(queue, M, N, K, create_slice(pA, 0, M, 0, K, swap_A),
create_slice(pB, 0, K, 0, N, swap_B), create_slice(pB, 0, K, 0, N, swap_B),
create_slice(pC, 0, M, 0, N, false), *_1, *_0, programs, label, 1); create_slice(pC, 0, M, 0, N, false), *_1, *_0, programs, label, 1, cache);
return; return;
} }
@@ -692,17 +695,17 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
int_t lN = N / p_.nL * p_.nL; int_t lN = N / p_.nL * p_.nL;
int_t lK = K / p_.kL * p_.kL; int_t lK = K / p_.kL * p_.kL;
enqueue_block(queue, lM, lN, lK, create_slice(pA, 0, lM, 0, lK, swap_A), create_slice(pB, 0, lK, 0, lN, swap_B), create_slice(pC, 0, lM, 0, lN, false), *_1, *_0, programs, label, 0); enqueue_block(queue, lM, lN, lK, create_slice(pA, 0, lM, 0, lK, swap_A), create_slice(pB, 0, lK, 0, lN, swap_B), create_slice(pC, 0, lM, 0, lN, false), *_1, *_0, programs, label, 0, cache);
enqueue_block(queue, lM, lN, K - lK, create_slice(pA, 0, lM, lK, K, swap_A), create_slice(pB, lK, K, 0, lN, swap_B), create_slice(pC, 0, lM, 0, lN, false), *_1, *_1, programs, label, 1); enqueue_block(queue, lM, lN, K - lK, create_slice(pA, 0, lM, lK, K, swap_A), create_slice(pB, lK, K, 0, lN, swap_B), create_slice(pC, 0, lM, 0, lN, false), *_1, *_1, programs, label, 1, cache);
enqueue_block(queue, lM, N - lN, lK, create_slice(pA, 0, lM, 0, lK, swap_A), create_slice(pB, 0, lK, lN, N, swap_B), create_slice(pC, 0, lM, lN, N, false), *_1, *_0, programs, label, 1); enqueue_block(queue, lM, N - lN, lK, create_slice(pA, 0, lM, 0, lK, swap_A), create_slice(pB, 0, lK, lN, N, swap_B), create_slice(pC, 0, lM, lN, N, false), *_1, *_0, programs, label, 1, cache);
enqueue_block(queue, lM, N - lN, K - lK, create_slice(pA, 0, lM, lK, K, swap_A), create_slice(pB, lK, K, lN, N, swap_B), create_slice(pC, 0, lM, lN, N, false), *_1, *_1, programs, label, 1); enqueue_block(queue, lM, N - lN, K - lK, create_slice(pA, 0, lM, lK, K, swap_A), create_slice(pB, lK, K, lN, N, swap_B), create_slice(pC, 0, lM, lN, N, false), *_1, *_1, programs, label, 1, cache);
enqueue_block(queue, M - lM, lN, lK, create_slice(pA, lM, M, 0, lK, swap_A), create_slice(pB, 0, lK, 0, lN, swap_B), create_slice(pC, lM, M, 0, lN, false), *_1, *_0, programs, label, 1); enqueue_block(queue, M - lM, lN, lK, create_slice(pA, lM, M, 0, lK, swap_A), create_slice(pB, 0, lK, 0, lN, swap_B), create_slice(pC, lM, M, 0, lN, false), *_1, *_0, programs, label, 1, cache);
enqueue_block(queue, M - lM, lN, K - lK, create_slice(pA, lM, M, lK, K, swap_A), create_slice(pB, lK, K, 0, lN, swap_B), create_slice(pC, lM, M, 0, lN, false), *_1, *_1, programs, label, 1); enqueue_block(queue, M - lM, lN, K - lK, create_slice(pA, lM, M, lK, K, swap_A), create_slice(pB, lK, K, 0, lN, swap_B), create_slice(pC, lM, M, 0, lN, false), *_1, *_1, programs, label, 1, cache);
enqueue_block(queue, M - lM, N - lN, lK, create_slice(pA, lM, M, 0, lK, swap_A), create_slice(pB, 0, lK, lN, N, swap_B), create_slice(pC, lM, M, lN, N, false), *_1, *_0, programs, label, 1); enqueue_block(queue, M - lM, N - lN, lK, create_slice(pA, lM, M, 0, lK, swap_A), create_slice(pB, 0, lK, lN, N, swap_B), create_slice(pC, lM, M, lN, N, false), *_1, *_0, programs, label, 1, cache);
enqueue_block(queue, M - lM, N - lN, K - lK, create_slice(pA, lM, M, lK, K, swap_A), create_slice(pB, lK, K, lN, N, swap_B), create_slice(pC, lM, M, lN, N, false), *_1, *_1, programs, label, 1); enqueue_block(queue, M - lM, N - lN, K - lK, create_slice(pA, lM, M, lK, K, swap_A), create_slice(pB, lK, K, lN, N, swap_B), create_slice(pC, lM, M, lN, N, false), *_1, *_1, programs, label, 1, cache);
} }
// //

View File

@@ -214,10 +214,8 @@ std::vector<int_t> mreduction::input_sizes(expressions_tuple const & expressions
return tools::make_vector<int_t>() << MN.first << MN.second; return tools::make_vector<int_t>() << MN.first << MN.second;
} }
void mreduction::enqueue(cl::CommandQueue & queue, void mreduction::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs,
std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, expressions_tuple const & expressions, operation_cache * cache)
unsigned int label,
expressions_tuple const & expressions)
{ {
char kname[10]; char kname[10];
fill_kernel_name(kname, label, "d"); fill_kernel_name(kname, label, "d");
@@ -240,6 +238,9 @@ void mreduction::enqueue(cl::CommandQueue & queue,
set_arguments(expressions, kernel, current_arg); set_arguments(expressions, kernel, current_arg);
queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange); queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange);
if(cache)
cache->push_back(queue, kernel, cl::NullRange, grange, lrange);
} }
mreduction_rows::mreduction_rows(mreduction_parameters const & parameters, mreduction_rows::mreduction_rows(mreduction_parameters const & parameters,

View File

@@ -280,10 +280,8 @@ std::vector<int_t> reduction::input_sizes(expressions_tuple const & expressions)
return tools::make_vector<int_t>() << N; return tools::make_vector<int_t>() << N;
} }
void reduction::enqueue(cl::CommandQueue & queue, void reduction::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs,
std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, expressions_tuple const & expressions, operation_cache * cache)
unsigned int label,
expressions_tuple const & expressions)
{ {
//Preprocessing //Preprocessing
int_t size = input_sizes(expressions)[0]; int_t size = input_sizes(expressions)[0];
@@ -343,6 +341,10 @@ void reduction::enqueue(cl::CommandQueue & queue,
for (unsigned int k = 0; k < 2; k++) for (unsigned int k = 0; k < 2; k++)
queue.enqueueNDRangeKernel(kernels[k], cl::NullRange, grange[k], lrange[k]); queue.enqueueNDRangeKernel(kernels[k], cl::NullRange, grange[k], lrange[k]);
if(cache)
for (unsigned int k = 0; k < 2; k++)
cache->push_back(queue, kernels[k], cl::NullRange, grange[k], lrange[k]);
} }
template class base_impl<reduction, reduction_parameters>; template class base_impl<reduction, reduction_parameters>;

View File

@@ -108,10 +108,8 @@ std::vector<int_t> vaxpy::input_sizes(expressions_tuple const & expressions)
return tools::make_vector<int_t>() << size; return tools::make_vector<int_t>() << size;
} }
void vaxpy::enqueue(cl::CommandQueue & queue, void vaxpy::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs,
std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, expressions_tuple const & expressions, operation_cache * cache)
unsigned int label,
expressions_tuple const & expressions)
{ {
//Size //Size
int_t size = input_sizes(expressions)[0]; int_t size = input_sizes(expressions)[0];
@@ -137,7 +135,9 @@ void vaxpy::enqueue(cl::CommandQueue & queue,
kernel.setArg(current_arg++, cl_uint(size)); kernel.setArg(current_arg++, cl_uint(size));
set_arguments(expressions, kernel, current_arg); set_arguments(expressions, kernel, current_arg);
queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange); queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange);
queue.flush();
if(cache)
cache->push_back(queue, kernel, cl::NullRange, grange, lrange);
} }

View File

@@ -90,7 +90,7 @@ model::model(std::vector< tools::shared_ptr<base> > const & templates, cl::Comma
model::model(base const & tp, cl::CommandQueue & queue) : templates_(1,tp.clone()), queue_(queue) model::model(base const & tp, cl::CommandQueue & queue) : templates_(1,tp.clone()), queue_(queue)
{} {}
void model::execute(expressions_tuple const & expressions, runtime_options const & opt) void model::execute(expressions_tuple const & expressions, operation_cache *cache, runtime_options const & opt)
{ {
std::vector<cl_ext::lazy_compiler> & compilers = init(expressions, opt); std::vector<cl_ext::lazy_compiler> & compilers = init(expressions, opt);
@@ -115,7 +115,7 @@ void model::execute(expressions_tuple const & expressions, runtime_options const
} }
//Execution //Execution
templates_[label]->enqueue(queue_, compilers, label, expressions); return templates_[label]->enqueue(queue_, compilers, label, expressions, cache);
} }
void model::tune(expressions_tuple const & expressions) void model::tune(expressions_tuple const & expressions)

View File

@@ -147,7 +147,7 @@ namespace atidlas
} }
/** @brief Executes a array_expression on the given models map*/ /** @brief Executes a array_expression on the given models map*/
void execute(atidlas::array_expression & array_expression, model_map_t & models) void execute(atidlas::array_expression & array_expression, model_map_t & models, operation_cache * cache)
{ {
cl::Context const & context = array_expression.context(); cl::Context const & context = array_expression.context();
size_t rootidx = array_expression.root(); size_t rootidx = array_expression.root();
@@ -215,7 +215,7 @@ namespace atidlas
} }
/*-----Compute final expression-----*/ /*-----Compute final expression-----*/
models[std::make_pair(final_type, dtype)]->execute(array_expression); models[std::make_pair(final_type, dtype)]->execute(array_expression, cache);
} }
} }