Implemented simple operation cache
This commit is contained in:
@@ -1,5 +1,5 @@
|
|||||||
#include "atidlas/array.h"
|
#include "atidlas/array.h"
|
||||||
#include "atidlas/model/model.h"
|
#include "atidlas/symbolic/execute.h"
|
||||||
#include "atidlas/tools/timer.hpp"
|
#include "atidlas/tools/timer.hpp"
|
||||||
#include "common.hpp"
|
#include "common.hpp"
|
||||||
#ifdef BENCH_CLAMDBLAS
|
#ifdef BENCH_CLAMDBLAS
|
||||||
@@ -32,7 +32,7 @@ void bench(ad::numeric_type dtype)
|
|||||||
times.clear();\
|
times.clear();\
|
||||||
total_time = 0;\
|
total_time = 0;\
|
||||||
OP;\
|
OP;\
|
||||||
ad::cl_ext::synchronize(ad::cl_ext::default_context());\
|
SYNC;\
|
||||||
while(total_time < 5e-1){\
|
while(total_time < 5e-1){\
|
||||||
timer.start(); \
|
timer.start(); \
|
||||||
OP;\
|
OP;\
|
||||||
@@ -44,7 +44,7 @@ void bench(ad::numeric_type dtype)
|
|||||||
std::cout << " " << PERF << std::flush;\
|
std::cout << " " << PERF << std::flush;\
|
||||||
}
|
}
|
||||||
|
|
||||||
#define CL_BENCHMARK(OP, PERF) BENCHMARK(OP, PERF, ad::cl_ext::synchronize(ad::cl_ext::default_context()))
|
#define CL_BENCHMARK(OP, PERF) BENCHMARK(OP, PERF, queue.flush(); queue.finish();)
|
||||||
|
|
||||||
#define CPU_SYNCHRONIZE
|
#define CPU_SYNCHRONIZE
|
||||||
#define CPU_BENCHMARK(OP, PERF) BENCHMARK(OP, PERF, CPU_SYNCHRONIZE)
|
#define CPU_BENCHMARK(OP, PERF) BENCHMARK(OP, PERF, CPU_SYNCHRONIZE)
|
||||||
@@ -62,11 +62,15 @@ void bench(ad::numeric_type dtype)
|
|||||||
std::cout << N;
|
std::cout << N;
|
||||||
/* ATIDLAS */
|
/* ATIDLAS */
|
||||||
ad::array x(N, dtype), y(N, dtype);
|
ad::array x(N, dtype), y(N, dtype);
|
||||||
|
cl::CommandQueue & queue = ad::cl_ext::get_queue(x.context(), 0);
|
||||||
|
ad::model & model = ad::get_model(queue, ad::VECTOR_AXPY_TYPE, dtype);
|
||||||
ad::array_expression E = ad::detail::assign(y, x + y);
|
ad::array_expression E = ad::detail::assign(y, x + y);
|
||||||
ad::model & model = ad::get_model(ad::cl_ext::get_queue(x.context(), 0), ad::VECTOR_AXPY_TYPE, dtype);
|
|
||||||
ad::model::runtime_options opt("saxpy");
|
|
||||||
model.tune(E);
|
model.tune(E);
|
||||||
CL_BENCHMARK(model.execute(E, opt), bandwidth(3*N, tres, dtsize));
|
ad::operation_cache cache;
|
||||||
|
model.execute(E, &cache);
|
||||||
|
queue.flush();
|
||||||
|
queue.finish();
|
||||||
|
CL_BENCHMARK(cache.enqueue(), bandwidth(3*N, tres, dtsize));
|
||||||
/* clAmdBlas */
|
/* clAmdBlas */
|
||||||
#ifdef BENCH_CLAMDBLAS
|
#ifdef BENCH_CLAMDBLAS
|
||||||
CL_BENCHMARK(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &ad::cl_ext::get_queue(x.context(), 0)(), 0, NULL, NULL), bandwidth(3*N, tres, dtsize))
|
CL_BENCHMARK(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &ad::cl_ext::get_queue(x.context(), 0)(), 0, NULL, NULL), bandwidth(3*N, tres, dtsize))
|
||||||
|
@@ -162,9 +162,8 @@ public:
|
|||||||
virtual ~base();
|
virtual ~base();
|
||||||
std::vector<std::string> generate(unsigned int label, expressions_tuple const & expressions, cl::Device const & device);
|
std::vector<std::string> generate(unsigned int label, expressions_tuple const & expressions, cl::Device const & device);
|
||||||
virtual int check_invalid(expressions_tuple const & expressions, cl::Device const & device) const = 0;
|
virtual int check_invalid(expressions_tuple const & expressions, cl::Device const & device) const = 0;
|
||||||
virtual void enqueue(cl::CommandQueue & queue,
|
virtual void enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs,
|
||||||
std::vector<cl_ext::lazy_compiler> & programs,
|
unsigned int label, expressions_tuple const & expressions, operation_cache* cache = NULL) = 0;
|
||||||
unsigned int label, expressions_tuple const & expressions) = 0;
|
|
||||||
virtual tools::shared_ptr<base> clone() const = 0;
|
virtual tools::shared_ptr<base> clone() const = 0;
|
||||||
private:
|
private:
|
||||||
binding_policy_t binding_policy_;
|
binding_policy_t binding_policy_;
|
||||||
|
@@ -27,7 +27,7 @@ public:
|
|||||||
maxpy(parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
|
maxpy(parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
|
||||||
maxpy(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, binding_policy_t bind = BIND_ALL_UNIQUE);
|
maxpy(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, binding_policy_t bind = BIND_ALL_UNIQUE);
|
||||||
std::vector<int_t> input_sizes(expressions_tuple const & expressions);
|
std::vector<int_t> input_sizes(expressions_tuple const & expressions);
|
||||||
void enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, expressions_tuple const & expressions);
|
void enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, expressions_tuple const & expressions, operation_cache * cache = NULL);
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@@ -41,18 +41,14 @@ private:
|
|||||||
void enqueue_block(cl::CommandQueue & queue, int_t M, int_t N, int_t K,
|
void enqueue_block(cl::CommandQueue & queue, int_t M, int_t N, int_t K,
|
||||||
array_infos const & A, array_infos const & B, array_infos const & C,
|
array_infos const & A, array_infos const & B, array_infos const & C,
|
||||||
value_scalar const & alpha, value_scalar const & beta,
|
value_scalar const & alpha, value_scalar const & beta,
|
||||||
std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, int id);
|
std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, int id, operation_cache * cache);
|
||||||
array_infos create_slice(array_infos & M, int_t s0_0, int_t s0_1, int_t s1_0, int_t s1_1, bool swap);
|
array_infos create_slice(array_infos & M, int_t s0_0, int_t s0_1, int_t s1_0, int_t s1_1, bool swap);
|
||||||
std::vector<int_t> infos(expressions_tuple const & expressions,
|
std::vector<int_t> infos(expressions_tuple const & expressions,
|
||||||
lhs_rhs_element & C, lhs_rhs_element & A, lhs_rhs_element & B);
|
lhs_rhs_element & C, lhs_rhs_element & A, lhs_rhs_element & B);
|
||||||
public:
|
public:
|
||||||
mproduct(mproduct::parameters_type const & parameters, char A_trans, char B_trans);
|
mproduct(mproduct::parameters_type const & parameters, char A_trans, char B_trans);
|
||||||
std::vector<int_t> input_sizes(expressions_tuple const & expressions);
|
std::vector<int_t> input_sizes(expressions_tuple const & expressions);
|
||||||
void enqueue(cl::CommandQueue & queue,
|
void enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, expressions_tuple const & expressions, operation_cache * cache = NULL);
|
||||||
std::vector<cl_ext::lazy_compiler> & programs,
|
|
||||||
unsigned int label,
|
|
||||||
expressions_tuple const & expressions);
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
const char A_trans_;
|
const char A_trans_;
|
||||||
const char B_trans_;
|
const char B_trans_;
|
||||||
|
@@ -35,7 +35,7 @@ private:
|
|||||||
std::vector<std::string> generate_impl(unsigned int, expressions_tuple const &, std::vector<mapping_type> const &) const;
|
std::vector<std::string> generate_impl(unsigned int, expressions_tuple const &, std::vector<mapping_type> const &) const;
|
||||||
public:
|
public:
|
||||||
virtual std::vector<int_t> input_sizes(expressions_tuple const & expressions);
|
virtual std::vector<int_t> input_sizes(expressions_tuple const & expressions);
|
||||||
void enqueue(cl::CommandQueue & queue,std::vector<cl_ext::lazy_compiler> & programs,unsigned int label, expressions_tuple const & expressions);
|
void enqueue(cl::CommandQueue & queue,std::vector<cl_ext::lazy_compiler> & programs,unsigned int label, expressions_tuple const & expressions, operation_cache * cache = NULL);
|
||||||
private:
|
private:
|
||||||
reduction_type reduction_type_;
|
reduction_type reduction_type_;
|
||||||
};
|
};
|
||||||
|
@@ -29,10 +29,7 @@ public:
|
|||||||
reduction(reduction::parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
|
reduction(reduction::parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
|
||||||
reduction(unsigned int simd, unsigned int ls, unsigned int ng, fetching_policy_type fetch, binding_policy_t bind = BIND_ALL_UNIQUE);
|
reduction(unsigned int simd, unsigned int ls, unsigned int ng, fetching_policy_type fetch, binding_policy_t bind = BIND_ALL_UNIQUE);
|
||||||
std::vector<int_t> input_sizes(expressions_tuple const & expressions);
|
std::vector<int_t> input_sizes(expressions_tuple const & expressions);
|
||||||
void enqueue(cl::CommandQueue & queue,
|
void enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, expressions_tuple const & expressions, operation_cache * cache = NULL);
|
||||||
std::vector<cl_ext::lazy_compiler> & programs,
|
|
||||||
unsigned int label,
|
|
||||||
expressions_tuple const & expressions);
|
|
||||||
private:
|
private:
|
||||||
std::vector< cl::Buffer > tmp_;
|
std::vector< cl::Buffer > tmp_;
|
||||||
std::vector< cl::Buffer > tmpidx_;
|
std::vector< cl::Buffer > tmpidx_;
|
||||||
|
@@ -23,8 +23,7 @@ public:
|
|||||||
vaxpy(vaxpy::parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
|
vaxpy(vaxpy::parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
|
||||||
vaxpy(unsigned int _simd_width, unsigned int _group_size, unsigned int _num_groups, fetching_policy_type _fetching_policy, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
|
vaxpy(unsigned int _simd_width, unsigned int _group_size, unsigned int _num_groups, fetching_policy_type _fetching_policy, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
|
||||||
std::vector<int_t> input_sizes(expressions_tuple const & expressions);
|
std::vector<int_t> input_sizes(expressions_tuple const & expressions);
|
||||||
void enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs,
|
void enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, expressions_tuple const & expressions, operation_cache * cache = NULL);
|
||||||
unsigned int label, expressions_tuple const & expressions);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@@ -38,7 +38,7 @@ namespace atidlas
|
|||||||
model(std::vector< tools::shared_ptr<base> > const &, cl::CommandQueue &);
|
model(std::vector< tools::shared_ptr<base> > const &, cl::CommandQueue &);
|
||||||
model(base const &, cl::CommandQueue &);
|
model(base const &, cl::CommandQueue &);
|
||||||
|
|
||||||
void execute(expressions_tuple const &, runtime_options const & opt = runtime_options());
|
void execute(expressions_tuple const &, operation_cache * cache = NULL, runtime_options const & opt = runtime_options());
|
||||||
void tune(expressions_tuple const &);
|
void tune(expressions_tuple const &);
|
||||||
|
|
||||||
templates_container const & templates() const;
|
templates_container const & templates() const;
|
||||||
|
@@ -9,7 +9,7 @@ namespace atidlas
|
|||||||
{
|
{
|
||||||
|
|
||||||
/** @brief Executes a array_expression on the given queue for the given models map*/
|
/** @brief Executes a array_expression on the given queue for the given models map*/
|
||||||
void execute(array_expression &, model_map_t &);
|
void execute(array_expression &, model_map_t &, operation_cache * cache = NULL);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -2,6 +2,7 @@
|
|||||||
#define ATIDLAS_TYPES_H
|
#define ATIDLAS_TYPES_H
|
||||||
|
|
||||||
#include <CL/cl.hpp>
|
#include <CL/cl.hpp>
|
||||||
|
#include <list>
|
||||||
#include "atidlas/exception/unknown_datatype.h"
|
#include "atidlas/exception/unknown_datatype.h"
|
||||||
|
|
||||||
namespace atidlas
|
namespace atidlas
|
||||||
@@ -60,6 +61,36 @@ struct array_infos
|
|||||||
int_t ld;
|
int_t ld;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
class operation_cache
|
||||||
|
{
|
||||||
|
struct infos
|
||||||
|
{
|
||||||
|
infos(cl::CommandQueue & q, cl::Kernel const & k, cl::NDRange const & off, cl::NDRange const & g, cl::NDRange const & l)
|
||||||
|
: queue(q), kernel(k), offset(off), grange(g), lrange(l) {}
|
||||||
|
|
||||||
|
cl::CommandQueue & queue;
|
||||||
|
cl::Kernel kernel;
|
||||||
|
cl::NDRange offset;
|
||||||
|
cl::NDRange grange;
|
||||||
|
cl::NDRange lrange;
|
||||||
|
};
|
||||||
|
|
||||||
|
public:
|
||||||
|
void push_back(cl::CommandQueue & queue, cl::Kernel const & kernel, cl::NDRange const & offset, cl::NDRange const & grange, cl::NDRange const & lrange)
|
||||||
|
{
|
||||||
|
l_.push_back(infos(queue, kernel, offset, grange, lrange));
|
||||||
|
}
|
||||||
|
|
||||||
|
void enqueue()
|
||||||
|
{
|
||||||
|
for(std::list<infos>::iterator it = l_.begin() ; it != l_.end() ; ++it)
|
||||||
|
it->queue.enqueueNDRangeKernel(it->kernel, it->offset, it->grange, it->lrange);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::list<infos> l_;
|
||||||
|
};
|
||||||
|
|
||||||
inline std::string numeric_type_to_string(numeric_type const & type)
|
inline std::string numeric_type_to_string(numeric_type const & type)
|
||||||
{
|
{
|
||||||
switch (type)
|
switch (type)
|
||||||
|
@@ -104,10 +104,8 @@ std::vector<int_t> maxpy::input_sizes(expressions_tuple const & expressions)
|
|||||||
return tools::make_vector<int_t>() << size.first << size.second;
|
return tools::make_vector<int_t>() << size.first << size.second;
|
||||||
}
|
}
|
||||||
|
|
||||||
void maxpy::enqueue(cl::CommandQueue & queue,
|
void maxpy::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs,
|
||||||
std::vector<cl_ext::lazy_compiler> & programs,
|
unsigned int label, expressions_tuple const & expressions, operation_cache * cache)
|
||||||
unsigned int label,
|
|
||||||
expressions_tuple const & expressions)
|
|
||||||
{
|
{
|
||||||
char kname[10];
|
char kname[10];
|
||||||
fill_kernel_name(kname, label, "d");
|
fill_kernel_name(kname, label, "d");
|
||||||
@@ -121,6 +119,9 @@ void maxpy::enqueue(cl::CommandQueue & queue,
|
|||||||
kernel.setArg(current_arg++, cl_uint(MN[1]));
|
kernel.setArg(current_arg++, cl_uint(MN[1]));
|
||||||
set_arguments(expressions, kernel, current_arg);
|
set_arguments(expressions, kernel, current_arg);
|
||||||
queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange);
|
queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange);
|
||||||
|
|
||||||
|
if(cache)
|
||||||
|
cache->push_back(queue, kernel, cl::NullRange, grange, lrange);
|
||||||
}
|
}
|
||||||
|
|
||||||
template class base_impl<maxpy, maxpy_parameters>;
|
template class base_impl<maxpy, maxpy_parameters>;
|
||||||
|
@@ -568,7 +568,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
|
|||||||
void mproduct::enqueue_block(cl::CommandQueue & queue, int_t M, int_t N, int_t K,
|
void mproduct::enqueue_block(cl::CommandQueue & queue, int_t M, int_t N, int_t K,
|
||||||
array_infos const & A, array_infos const & B, array_infos const & C,
|
array_infos const & A, array_infos const & B, array_infos const & C,
|
||||||
value_scalar const & alpha, value_scalar const & beta,
|
value_scalar const & alpha, value_scalar const & beta,
|
||||||
std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, int id)
|
std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, int id, operation_cache * cache)
|
||||||
{
|
{
|
||||||
if (A.shape1==0 || A.shape2==0 || B.shape1==0 || B.shape2==0 || C.shape1==0 || C.shape2==0)
|
if (A.shape1==0 || A.shape2==0 || B.shape1==0 || B.shape2==0 || C.shape1==0 || C.shape2==0)
|
||||||
return;
|
return;
|
||||||
@@ -596,6 +596,9 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
|
|||||||
fun.set_arguments(beta.dtype(), beta.values());
|
fun.set_arguments(beta.dtype(), beta.values());
|
||||||
|
|
||||||
queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange);
|
queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange);
|
||||||
|
|
||||||
|
if(cache)
|
||||||
|
cache->push_back(queue, kernel, cl::NullRange, grange, lrange);
|
||||||
}
|
}
|
||||||
|
|
||||||
array_infos mproduct::create_slice(array_infos & M, int_t s0_0, int_t s0_1, int_t s1_0, int_t s1_1, bool swap)
|
array_infos mproduct::create_slice(array_infos & M, int_t s0_0, int_t s0_1, int_t s1_0, int_t s1_1, bool swap)
|
||||||
@@ -646,7 +649,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
|
|||||||
return infos(expressions, d0, d1, d2);
|
return infos(expressions, d0, d1, d2);
|
||||||
}
|
}
|
||||||
|
|
||||||
void mproduct::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, expressions_tuple const & expressions)
|
void mproduct::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, expressions_tuple const & expressions, operation_cache * cache)
|
||||||
{
|
{
|
||||||
using namespace tools;
|
using namespace tools;
|
||||||
|
|
||||||
@@ -684,7 +687,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
|
|||||||
{
|
{
|
||||||
enqueue_block(queue, M, N, K, create_slice(pA, 0, M, 0, K, swap_A),
|
enqueue_block(queue, M, N, K, create_slice(pA, 0, M, 0, K, swap_A),
|
||||||
create_slice(pB, 0, K, 0, N, swap_B),
|
create_slice(pB, 0, K, 0, N, swap_B),
|
||||||
create_slice(pC, 0, M, 0, N, false), *_1, *_0, programs, label, 1);
|
create_slice(pC, 0, M, 0, N, false), *_1, *_0, programs, label, 1, cache);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -692,17 +695,17 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
|
|||||||
int_t lN = N / p_.nL * p_.nL;
|
int_t lN = N / p_.nL * p_.nL;
|
||||||
int_t lK = K / p_.kL * p_.kL;
|
int_t lK = K / p_.kL * p_.kL;
|
||||||
|
|
||||||
enqueue_block(queue, lM, lN, lK, create_slice(pA, 0, lM, 0, lK, swap_A), create_slice(pB, 0, lK, 0, lN, swap_B), create_slice(pC, 0, lM, 0, lN, false), *_1, *_0, programs, label, 0);
|
enqueue_block(queue, lM, lN, lK, create_slice(pA, 0, lM, 0, lK, swap_A), create_slice(pB, 0, lK, 0, lN, swap_B), create_slice(pC, 0, lM, 0, lN, false), *_1, *_0, programs, label, 0, cache);
|
||||||
enqueue_block(queue, lM, lN, K - lK, create_slice(pA, 0, lM, lK, K, swap_A), create_slice(pB, lK, K, 0, lN, swap_B), create_slice(pC, 0, lM, 0, lN, false), *_1, *_1, programs, label, 1);
|
enqueue_block(queue, lM, lN, K - lK, create_slice(pA, 0, lM, lK, K, swap_A), create_slice(pB, lK, K, 0, lN, swap_B), create_slice(pC, 0, lM, 0, lN, false), *_1, *_1, programs, label, 1, cache);
|
||||||
|
|
||||||
enqueue_block(queue, lM, N - lN, lK, create_slice(pA, 0, lM, 0, lK, swap_A), create_slice(pB, 0, lK, lN, N, swap_B), create_slice(pC, 0, lM, lN, N, false), *_1, *_0, programs, label, 1);
|
enqueue_block(queue, lM, N - lN, lK, create_slice(pA, 0, lM, 0, lK, swap_A), create_slice(pB, 0, lK, lN, N, swap_B), create_slice(pC, 0, lM, lN, N, false), *_1, *_0, programs, label, 1, cache);
|
||||||
enqueue_block(queue, lM, N - lN, K - lK, create_slice(pA, 0, lM, lK, K, swap_A), create_slice(pB, lK, K, lN, N, swap_B), create_slice(pC, 0, lM, lN, N, false), *_1, *_1, programs, label, 1);
|
enqueue_block(queue, lM, N - lN, K - lK, create_slice(pA, 0, lM, lK, K, swap_A), create_slice(pB, lK, K, lN, N, swap_B), create_slice(pC, 0, lM, lN, N, false), *_1, *_1, programs, label, 1, cache);
|
||||||
|
|
||||||
enqueue_block(queue, M - lM, lN, lK, create_slice(pA, lM, M, 0, lK, swap_A), create_slice(pB, 0, lK, 0, lN, swap_B), create_slice(pC, lM, M, 0, lN, false), *_1, *_0, programs, label, 1);
|
enqueue_block(queue, M - lM, lN, lK, create_slice(pA, lM, M, 0, lK, swap_A), create_slice(pB, 0, lK, 0, lN, swap_B), create_slice(pC, lM, M, 0, lN, false), *_1, *_0, programs, label, 1, cache);
|
||||||
enqueue_block(queue, M - lM, lN, K - lK, create_slice(pA, lM, M, lK, K, swap_A), create_slice(pB, lK, K, 0, lN, swap_B), create_slice(pC, lM, M, 0, lN, false), *_1, *_1, programs, label, 1);
|
enqueue_block(queue, M - lM, lN, K - lK, create_slice(pA, lM, M, lK, K, swap_A), create_slice(pB, lK, K, 0, lN, swap_B), create_slice(pC, lM, M, 0, lN, false), *_1, *_1, programs, label, 1, cache);
|
||||||
|
|
||||||
enqueue_block(queue, M - lM, N - lN, lK, create_slice(pA, lM, M, 0, lK, swap_A), create_slice(pB, 0, lK, lN, N, swap_B), create_slice(pC, lM, M, lN, N, false), *_1, *_0, programs, label, 1);
|
enqueue_block(queue, M - lM, N - lN, lK, create_slice(pA, lM, M, 0, lK, swap_A), create_slice(pB, 0, lK, lN, N, swap_B), create_slice(pC, lM, M, lN, N, false), *_1, *_0, programs, label, 1, cache);
|
||||||
enqueue_block(queue, M - lM, N - lN, K - lK, create_slice(pA, lM, M, lK, K, swap_A), create_slice(pB, lK, K, lN, N, swap_B), create_slice(pC, lM, M, lN, N, false), *_1, *_1, programs, label, 1);
|
enqueue_block(queue, M - lM, N - lN, K - lK, create_slice(pA, lM, M, lK, K, swap_A), create_slice(pB, lK, K, lN, N, swap_B), create_slice(pC, lM, M, lN, N, false), *_1, *_1, programs, label, 1, cache);
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@@ -214,10 +214,8 @@ std::vector<int_t> mreduction::input_sizes(expressions_tuple const & expressions
|
|||||||
return tools::make_vector<int_t>() << MN.first << MN.second;
|
return tools::make_vector<int_t>() << MN.first << MN.second;
|
||||||
}
|
}
|
||||||
|
|
||||||
void mreduction::enqueue(cl::CommandQueue & queue,
|
void mreduction::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs,
|
||||||
std::vector<cl_ext::lazy_compiler> & programs,
|
unsigned int label, expressions_tuple const & expressions, operation_cache * cache)
|
||||||
unsigned int label,
|
|
||||||
expressions_tuple const & expressions)
|
|
||||||
{
|
{
|
||||||
char kname[10];
|
char kname[10];
|
||||||
fill_kernel_name(kname, label, "d");
|
fill_kernel_name(kname, label, "d");
|
||||||
@@ -240,6 +238,9 @@ void mreduction::enqueue(cl::CommandQueue & queue,
|
|||||||
set_arguments(expressions, kernel, current_arg);
|
set_arguments(expressions, kernel, current_arg);
|
||||||
|
|
||||||
queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange);
|
queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange);
|
||||||
|
|
||||||
|
if(cache)
|
||||||
|
cache->push_back(queue, kernel, cl::NullRange, grange, lrange);
|
||||||
}
|
}
|
||||||
|
|
||||||
mreduction_rows::mreduction_rows(mreduction_parameters const & parameters,
|
mreduction_rows::mreduction_rows(mreduction_parameters const & parameters,
|
||||||
|
@@ -280,10 +280,8 @@ std::vector<int_t> reduction::input_sizes(expressions_tuple const & expressions)
|
|||||||
return tools::make_vector<int_t>() << N;
|
return tools::make_vector<int_t>() << N;
|
||||||
}
|
}
|
||||||
|
|
||||||
void reduction::enqueue(cl::CommandQueue & queue,
|
void reduction::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs,
|
||||||
std::vector<cl_ext::lazy_compiler> & programs,
|
unsigned int label, expressions_tuple const & expressions, operation_cache * cache)
|
||||||
unsigned int label,
|
|
||||||
expressions_tuple const & expressions)
|
|
||||||
{
|
{
|
||||||
//Preprocessing
|
//Preprocessing
|
||||||
int_t size = input_sizes(expressions)[0];
|
int_t size = input_sizes(expressions)[0];
|
||||||
@@ -343,6 +341,10 @@ void reduction::enqueue(cl::CommandQueue & queue,
|
|||||||
|
|
||||||
for (unsigned int k = 0; k < 2; k++)
|
for (unsigned int k = 0; k < 2; k++)
|
||||||
queue.enqueueNDRangeKernel(kernels[k], cl::NullRange, grange[k], lrange[k]);
|
queue.enqueueNDRangeKernel(kernels[k], cl::NullRange, grange[k], lrange[k]);
|
||||||
|
|
||||||
|
if(cache)
|
||||||
|
for (unsigned int k = 0; k < 2; k++)
|
||||||
|
cache->push_back(queue, kernels[k], cl::NullRange, grange[k], lrange[k]);
|
||||||
}
|
}
|
||||||
|
|
||||||
template class base_impl<reduction, reduction_parameters>;
|
template class base_impl<reduction, reduction_parameters>;
|
||||||
|
@@ -108,10 +108,8 @@ std::vector<int_t> vaxpy::input_sizes(expressions_tuple const & expressions)
|
|||||||
return tools::make_vector<int_t>() << size;
|
return tools::make_vector<int_t>() << size;
|
||||||
}
|
}
|
||||||
|
|
||||||
void vaxpy::enqueue(cl::CommandQueue & queue,
|
void vaxpy::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs,
|
||||||
std::vector<cl_ext::lazy_compiler> & programs,
|
unsigned int label, expressions_tuple const & expressions, operation_cache * cache)
|
||||||
unsigned int label,
|
|
||||||
expressions_tuple const & expressions)
|
|
||||||
{
|
{
|
||||||
//Size
|
//Size
|
||||||
int_t size = input_sizes(expressions)[0];
|
int_t size = input_sizes(expressions)[0];
|
||||||
@@ -137,7 +135,9 @@ void vaxpy::enqueue(cl::CommandQueue & queue,
|
|||||||
kernel.setArg(current_arg++, cl_uint(size));
|
kernel.setArg(current_arg++, cl_uint(size));
|
||||||
set_arguments(expressions, kernel, current_arg);
|
set_arguments(expressions, kernel, current_arg);
|
||||||
queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange);
|
queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange);
|
||||||
queue.flush();
|
|
||||||
|
if(cache)
|
||||||
|
cache->push_back(queue, kernel, cl::NullRange, grange, lrange);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -90,7 +90,7 @@ model::model(std::vector< tools::shared_ptr<base> > const & templates, cl::Comma
|
|||||||
model::model(base const & tp, cl::CommandQueue & queue) : templates_(1,tp.clone()), queue_(queue)
|
model::model(base const & tp, cl::CommandQueue & queue) : templates_(1,tp.clone()), queue_(queue)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
void model::execute(expressions_tuple const & expressions, runtime_options const & opt)
|
void model::execute(expressions_tuple const & expressions, operation_cache *cache, runtime_options const & opt)
|
||||||
{
|
{
|
||||||
std::vector<cl_ext::lazy_compiler> & compilers = init(expressions, opt);
|
std::vector<cl_ext::lazy_compiler> & compilers = init(expressions, opt);
|
||||||
|
|
||||||
@@ -115,7 +115,7 @@ void model::execute(expressions_tuple const & expressions, runtime_options const
|
|||||||
}
|
}
|
||||||
|
|
||||||
//Execution
|
//Execution
|
||||||
templates_[label]->enqueue(queue_, compilers, label, expressions);
|
return templates_[label]->enqueue(queue_, compilers, label, expressions, cache);
|
||||||
}
|
}
|
||||||
|
|
||||||
void model::tune(expressions_tuple const & expressions)
|
void model::tune(expressions_tuple const & expressions)
|
||||||
|
@@ -147,7 +147,7 @@ namespace atidlas
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** @brief Executes a array_expression on the given models map*/
|
/** @brief Executes a array_expression on the given models map*/
|
||||||
void execute(atidlas::array_expression & array_expression, model_map_t & models)
|
void execute(atidlas::array_expression & array_expression, model_map_t & models, operation_cache * cache)
|
||||||
{
|
{
|
||||||
cl::Context const & context = array_expression.context();
|
cl::Context const & context = array_expression.context();
|
||||||
size_t rootidx = array_expression.root();
|
size_t rootidx = array_expression.root();
|
||||||
@@ -215,7 +215,7 @@ namespace atidlas
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*-----Compute final expression-----*/
|
/*-----Compute final expression-----*/
|
||||||
models[std::make_pair(final_type, dtype)]->execute(array_expression);
|
models[std::make_pair(final_type, dtype)]->execute(array_expression, cache);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user