Better control flow through options
This commit is contained in:
@@ -105,23 +105,22 @@ std::vector<int_t> maxpy::input_sizes(expressions_tuple const & expressions)
|
||||
}
|
||||
|
||||
void maxpy::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs,
|
||||
unsigned int label, expressions_tuple const & expressions, operation_cache * cache)
|
||||
unsigned int label, controller<expressions_tuple> const & controller)
|
||||
{
|
||||
expressions_tuple const & expressions = controller.x();
|
||||
char kname[10];
|
||||
fill_kernel_name(kname, label, "d");
|
||||
cl::Program & program = programs[0].program();
|
||||
cl::Kernel kernel(program, kname);
|
||||
cl::NDRange grange(p_.local_size_0*p_.num_groups_0, p_.local_size_1*p_.num_groups_1);
|
||||
cl::NDRange lrange(p_.local_size_0, p_.local_size_1);
|
||||
cl::NDRange global(p_.local_size_0*p_.num_groups_0, p_.local_size_1*p_.num_groups_1);
|
||||
cl::NDRange local(p_.local_size_0, p_.local_size_1);
|
||||
unsigned int current_arg = 0;
|
||||
std::vector<int_t> MN = input_sizes(expressions);
|
||||
kernel.setArg(current_arg++, cl_uint(MN[0]));
|
||||
kernel.setArg(current_arg++, cl_uint(MN[1]));
|
||||
set_arguments(expressions, kernel, current_arg);
|
||||
queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange);
|
||||
|
||||
if(cache)
|
||||
cache->push_back(queue, kernel, cl::NullRange, grange, lrange);
|
||||
controller.execution_options().enqueue_cache(queue, kernel, cl::NullRange, global, local);
|
||||
}
|
||||
|
||||
template class base_impl<maxpy, maxpy_parameters>;
|
||||
|
@@ -568,7 +568,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
|
||||
void mproduct::enqueue_block(cl::CommandQueue & queue, int_t M, int_t N, int_t K,
|
||||
array_infos const & A, array_infos const & B, array_infos const & C,
|
||||
value_scalar const & alpha, value_scalar const & beta,
|
||||
std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, int id, operation_cache * cache)
|
||||
std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, int id, execution_options_type const & options)
|
||||
{
|
||||
if (A.shape1==0 || A.shape2==0 || B.shape1==0 || B.shape2==0 || C.shape1==0 || C.shape2==0)
|
||||
return;
|
||||
@@ -578,8 +578,8 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
|
||||
|
||||
cl::Program & program = programs[id].program();
|
||||
cl::Kernel kernel(program, kname);
|
||||
cl::NDRange lrange(p_.local_size_0, p_.local_size_1);
|
||||
cl::NDRange grange = (id==1)?cl::NDRange(align(align(M,p_.mS)/p_.mS, p_.local_size_0), align(align(N,p_.nS)/p_.nS, p_.local_size_1)):
|
||||
cl::NDRange local(p_.local_size_0, p_.local_size_1);
|
||||
cl::NDRange global = (id==1)?cl::NDRange(align(align(M,p_.mS)/p_.mS, p_.local_size_0), align(align(N,p_.nS)/p_.nS, p_.local_size_1)):
|
||||
cl::NDRange(M/p_.mS, N/p_.nS);
|
||||
|
||||
unsigned int current_arg = 0;
|
||||
@@ -595,10 +595,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
|
||||
fun.set_arguments(B);
|
||||
fun.set_arguments(beta.dtype(), beta.values());
|
||||
|
||||
queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange);
|
||||
|
||||
if(cache)
|
||||
cache->push_back(queue, kernel, cl::NullRange, grange, lrange);
|
||||
options.enqueue_cache(queue,kernel, cl::NullRange, global, local);
|
||||
}
|
||||
|
||||
array_infos mproduct::create_slice(array_infos & M, int_t s0_0, int_t s0_1, int_t s1_0, int_t s1_1, bool swap)
|
||||
@@ -649,13 +646,17 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
|
||||
return infos(expressions, d0, d1, d2);
|
||||
}
|
||||
|
||||
void mproduct::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, expressions_tuple const & expressions, operation_cache * cache)
|
||||
void mproduct::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, controller<expressions_tuple> const & controller)
|
||||
{
|
||||
using namespace tools;
|
||||
|
||||
expressions_tuple const & expressions = controller.x();
|
||||
|
||||
lhs_rhs_element C, A, B;
|
||||
std::vector<int_t> MNK = infos(expressions, C, A, B);
|
||||
|
||||
execution_options_type const & options = controller.execution_options();
|
||||
|
||||
int_t M = MNK[0];
|
||||
int_t N = MNK[1];
|
||||
int_t K = MNK[2];
|
||||
@@ -687,7 +688,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
|
||||
{
|
||||
enqueue_block(queue, M, N, K, create_slice(pA, 0, M, 0, K, swap_A),
|
||||
create_slice(pB, 0, K, 0, N, swap_B),
|
||||
create_slice(pC, 0, M, 0, N, false), *_1, *_0, programs, label, 1, cache);
|
||||
create_slice(pC, 0, M, 0, N, false), *_1, *_0, programs, label, 1, options);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -695,17 +696,17 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
|
||||
int_t lN = N / p_.nL * p_.nL;
|
||||
int_t lK = K / p_.kL * p_.kL;
|
||||
|
||||
enqueue_block(queue, lM, lN, lK, create_slice(pA, 0, lM, 0, lK, swap_A), create_slice(pB, 0, lK, 0, lN, swap_B), create_slice(pC, 0, lM, 0, lN, false), *_1, *_0, programs, label, 0, cache);
|
||||
enqueue_block(queue, lM, lN, K - lK, create_slice(pA, 0, lM, lK, K, swap_A), create_slice(pB, lK, K, 0, lN, swap_B), create_slice(pC, 0, lM, 0, lN, false), *_1, *_1, programs, label, 1, cache);
|
||||
enqueue_block(queue, lM, lN, lK, create_slice(pA, 0, lM, 0, lK, swap_A), create_slice(pB, 0, lK, 0, lN, swap_B), create_slice(pC, 0, lM, 0, lN, false), *_1, *_0, programs, label, 0, options);
|
||||
enqueue_block(queue, lM, lN, K - lK, create_slice(pA, 0, lM, lK, K, swap_A), create_slice(pB, lK, K, 0, lN, swap_B), create_slice(pC, 0, lM, 0, lN, false), *_1, *_1, programs, label, 1, options);
|
||||
|
||||
enqueue_block(queue, lM, N - lN, lK, create_slice(pA, 0, lM, 0, lK, swap_A), create_slice(pB, 0, lK, lN, N, swap_B), create_slice(pC, 0, lM, lN, N, false), *_1, *_0, programs, label, 1, cache);
|
||||
enqueue_block(queue, lM, N - lN, K - lK, create_slice(pA, 0, lM, lK, K, swap_A), create_slice(pB, lK, K, lN, N, swap_B), create_slice(pC, 0, lM, lN, N, false), *_1, *_1, programs, label, 1, cache);
|
||||
enqueue_block(queue, lM, N - lN, lK, create_slice(pA, 0, lM, 0, lK, swap_A), create_slice(pB, 0, lK, lN, N, swap_B), create_slice(pC, 0, lM, lN, N, false), *_1, *_0, programs, label, 1, options);
|
||||
enqueue_block(queue, lM, N - lN, K - lK, create_slice(pA, 0, lM, lK, K, swap_A), create_slice(pB, lK, K, lN, N, swap_B), create_slice(pC, 0, lM, lN, N, false), *_1, *_1, programs, label, 1, options);
|
||||
|
||||
enqueue_block(queue, M - lM, lN, lK, create_slice(pA, lM, M, 0, lK, swap_A), create_slice(pB, 0, lK, 0, lN, swap_B), create_slice(pC, lM, M, 0, lN, false), *_1, *_0, programs, label, 1, cache);
|
||||
enqueue_block(queue, M - lM, lN, K - lK, create_slice(pA, lM, M, lK, K, swap_A), create_slice(pB, lK, K, 0, lN, swap_B), create_slice(pC, lM, M, 0, lN, false), *_1, *_1, programs, label, 1, cache);
|
||||
enqueue_block(queue, M - lM, lN, lK, create_slice(pA, lM, M, 0, lK, swap_A), create_slice(pB, 0, lK, 0, lN, swap_B), create_slice(pC, lM, M, 0, lN, false), *_1, *_0, programs, label, 1, options);
|
||||
enqueue_block(queue, M - lM, lN, K - lK, create_slice(pA, lM, M, lK, K, swap_A), create_slice(pB, lK, K, 0, lN, swap_B), create_slice(pC, lM, M, 0, lN, false), *_1, *_1, programs, label, 1, options);
|
||||
|
||||
enqueue_block(queue, M - lM, N - lN, lK, create_slice(pA, lM, M, 0, lK, swap_A), create_slice(pB, 0, lK, lN, N, swap_B), create_slice(pC, lM, M, lN, N, false), *_1, *_0, programs, label, 1, cache);
|
||||
enqueue_block(queue, M - lM, N - lN, K - lK, create_slice(pA, lM, M, lK, K, swap_A), create_slice(pB, lK, K, lN, N, swap_B), create_slice(pC, lM, M, lN, N, false), *_1, *_1, programs, label, 1, cache);
|
||||
enqueue_block(queue, M - lM, N - lN, lK, create_slice(pA, lM, M, 0, lK, swap_A), create_slice(pB, 0, lK, lN, N, swap_B), create_slice(pC, lM, M, lN, N, false), *_1, *_0, programs, label, 1, options);
|
||||
enqueue_block(queue, M - lM, N - lN, K - lK, create_slice(pA, lM, M, lK, K, swap_A), create_slice(pB, lK, K, lN, N, swap_B), create_slice(pC, lM, M, lN, N, false), *_1, *_1, programs, label, 1, options);
|
||||
}
|
||||
|
||||
//
|
||||
|
@@ -214,9 +214,10 @@ std::vector<int_t> mreduction::input_sizes(expressions_tuple const & expressions
|
||||
return tools::make_vector<int_t>() << MN.first << MN.second;
|
||||
}
|
||||
|
||||
void mreduction::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs,
|
||||
unsigned int label, expressions_tuple const & expressions, operation_cache * cache)
|
||||
void mreduction::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, controller<expressions_tuple> const & controller)
|
||||
{
|
||||
expressions_tuple const & expressions = controller.x();
|
||||
|
||||
char kname[10];
|
||||
fill_kernel_name(kname, label, "d");
|
||||
std::vector<int_t> MN = input_sizes(expressions);
|
||||
@@ -229,18 +230,15 @@ void mreduction::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_comp
|
||||
cl::Kernel kernel(program, kname);
|
||||
|
||||
//NDRange
|
||||
cl::NDRange grange(p_.local_size_0*p_.num_groups_0, p_.local_size_1);
|
||||
cl::NDRange lrange(p_.local_size_0, p_.local_size_1);
|
||||
cl::NDRange global(p_.local_size_0*p_.num_groups_0, p_.local_size_1);
|
||||
cl::NDRange local(p_.local_size_0, p_.local_size_1);
|
||||
|
||||
unsigned int current_arg = 0;
|
||||
kernel.setArg(current_arg++, cl_uint(MN[0]));
|
||||
kernel.setArg(current_arg++, cl_uint(MN[1]));
|
||||
set_arguments(expressions, kernel, current_arg);
|
||||
|
||||
queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange);
|
||||
|
||||
if(cache)
|
||||
cache->push_back(queue, kernel, cl::NullRange, grange, lrange);
|
||||
controller.execution_options().enqueue_cache(queue, kernel, cl::NullRange, global, local);
|
||||
}
|
||||
|
||||
mreduction_rows::mreduction_rows(mreduction_parameters const & parameters,
|
||||
|
@@ -280,9 +280,10 @@ std::vector<int_t> reduction::input_sizes(expressions_tuple const & expressions)
|
||||
return tools::make_vector<int_t>() << N;
|
||||
}
|
||||
|
||||
void reduction::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs,
|
||||
unsigned int label, expressions_tuple const & expressions, operation_cache * cache)
|
||||
void reduction::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, controller<expressions_tuple> const & controller)
|
||||
{
|
||||
expressions_tuple const & expressions = controller.x();
|
||||
|
||||
//Preprocessing
|
||||
int_t size = input_sizes(expressions)[0];
|
||||
std::vector<array_expression::node const *> reductions;
|
||||
@@ -307,8 +308,8 @@ void reduction::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compi
|
||||
cl::Kernel(program, fallback?kfallback[1]:kopt[1]) };
|
||||
|
||||
//NDRange
|
||||
cl::NDRange grange[2] = { cl::NDRange(p_.local_size_0*p_.num_groups), cl::NDRange(p_.local_size_0) };
|
||||
cl::NDRange lrange[2] = { cl::NDRange(p_.local_size_0), cl::NDRange(p_.local_size_0) };
|
||||
cl::NDRange global[2] = { cl::NDRange(p_.local_size_0*p_.num_groups), cl::NDRange(p_.local_size_0) };
|
||||
cl::NDRange local[2] = { cl::NDRange(p_.local_size_0), cl::NDRange(p_.local_size_0) };
|
||||
|
||||
//Arguments
|
||||
cl::Context context = expressions.context();
|
||||
@@ -340,11 +341,7 @@ void reduction::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compi
|
||||
}
|
||||
|
||||
for (unsigned int k = 0; k < 2; k++)
|
||||
queue.enqueueNDRangeKernel(kernels[k], cl::NullRange, grange[k], lrange[k]);
|
||||
|
||||
if(cache)
|
||||
for (unsigned int k = 0; k < 2; k++)
|
||||
cache->push_back(queue, kernels[k], cl::NullRange, grange[k], lrange[k]);
|
||||
controller.execution_options().enqueue_cache(queue, kernels[k], cl::NullRange, global[k], local[k]);
|
||||
}
|
||||
|
||||
template class base_impl<reduction, reduction_parameters>;
|
||||
|
@@ -108,9 +108,9 @@ std::vector<int_t> vaxpy::input_sizes(expressions_tuple const & expressions)
|
||||
return tools::make_vector<int_t>() << size;
|
||||
}
|
||||
|
||||
void vaxpy::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs,
|
||||
unsigned int label, expressions_tuple const & expressions, operation_cache * cache)
|
||||
void vaxpy::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, controller<expressions_tuple> const & controller)
|
||||
{
|
||||
expressions_tuple const & expressions = controller.x();
|
||||
//Size
|
||||
int_t size = input_sizes(expressions)[0];
|
||||
//Kernel
|
||||
@@ -128,16 +128,14 @@ void vaxpy::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler>
|
||||
cl::Kernel & kernel = it->second;
|
||||
|
||||
//NDRange
|
||||
cl::NDRange grange(p_.local_size_0*p_.num_groups);
|
||||
cl::NDRange lrange(p_.local_size_0);
|
||||
cl::NDRange global(p_.local_size_0*p_.num_groups);
|
||||
cl::NDRange local(p_.local_size_0);
|
||||
//Arguments
|
||||
unsigned int current_arg = 0;
|
||||
kernel.setArg(current_arg++, cl_uint(size));
|
||||
set_arguments(expressions, kernel, current_arg);
|
||||
queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange);
|
||||
|
||||
if(cache)
|
||||
cache->push_back(queue, kernel, cl::NullRange, grange, lrange);
|
||||
controller.execution_options().enqueue_cache(queue, kernel, cl::NullRange, global, local);
|
||||
}
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user