Kernels: Fixed various corner cases for the kernel templates and BLAS

This commit is contained in:
Philippe Tillet
2015-11-25 18:42:25 -05:00
parent 6be5929b0d
commit 6fc94c0c0b
15 changed files with 107 additions and 38 deletions

View File

@@ -20,6 +20,8 @@ class ISAACAPI Context
friend class CommandQueue;
friend class Buffer;
static std::string cache_path();
public:
explicit Context(CUcontext const & context, CUdevice const & device, bool take_ownership = true);
explicit Context(cl_context const & context, bool take_ownership = true);

View File

@@ -903,6 +903,8 @@ math_expression dot(LTYPE const & x, RTYPE const & y)\
{\
numeric_type dtype = x.dtype();\
driver::Context const & context = x.context();\
if(x.shape().max()==1 || y.shape().max()==1)\
return x*y;\
if(x.dim()==2 && x.shape()[1]==0)\
return zeros(x.shape()[0], y.shape()[1], dtype, context);\
if(x.shape()[0]==0 || (y.dim()==2 && y.shape()[1]==0))\
@@ -927,10 +929,12 @@ math_expression dot(LTYPE const & x, RTYPE const & y)\
else\
return trans(detail::matvecprod(trans(y), trans(x)));\
}\
if(x.shape()[0]==1)\
if(x.shape()[0]==1 && y.shape()[1]==1)\
return sum(x*trans(y));\
if(x.shape()[1]==1)\
return outer(x, y);\
if(x.shape()[0]==1 && y.shape()[1]==2)\
return trans(detail::matvecprod(trans(y), trans(x)));\
if(x.shape()[1]==1 && y.shape()[0]==1)\
return x*y;\
else /*if(x.dim()==2 && y.dim()==2)*/\
return detail::matmatprod(x, y);\
}
@@ -995,7 +999,7 @@ void copy(void const * data, array_base& x, driver::CommandQueue & queue, bool b
void copy(array_base const & x, void* data, driver::CommandQueue & queue, bool blocking)
{
unsigned int dtypesize = size_of(x.dtype());
if(x.start()==0 && x.shape()[0]*x.stride().prod()==x.shape().prod())
if(x.start()==0 && x.stride().prod()==x.shape().prod())
{
queue.read(x.data(), blocking, 0, x.shape().prod()*dtypesize, data);
}

View File

@@ -4,7 +4,9 @@
#include "isaac/driver/program.h"
#include "helpers/ocl/infos.hpp"
#include "getenv.hpp"
#include "mkdir.hpp"
namespace isaac
{
@@ -12,17 +14,37 @@ namespace isaac
namespace driver
{
Context::Context(CUcontext const & context, CUdevice const & device, bool take_ownership) : backend_(CUDA), device_(device, false), cache_path_(tools::getenv("ISAAC_CACHE_PATH")), h_(backend_, take_ownership)
// Picks the directory handed to cache_path_ by the Context constructors.
// Lookup order:
//   1. the ISAAC_CACHE_PATH environment variable, returned verbatim when non-empty;
//   2. "<HOME>/.isaac/cache/", returned only when tools::mkpath() reports 0 for it
//      (presumably: the directory exists or was created -- confirm in mkdir.hpp);
//   3. an empty string when neither is available.
std::string Context::cache_path()
{
  //an explicit user override always wins
  std::string const user_dir = tools::getenv("ISAAC_CACHE_PATH");
  if(user_dir.size() > 0)
    return user_dir;

  //otherwise fall back to a per-user directory under HOME
  std::string const home = tools::getenv("HOME");
  if(home.empty())
    return "";

  std::string fallback = home + "/.isaac/cache/";
  if(tools::mkpath(fallback)!=0)
    return "";
  return fallback;
}
Context::Context(CUcontext const & context, CUdevice const & device, bool take_ownership) : backend_(CUDA), device_(device, false), cache_path_(cache_path()), h_(backend_, take_ownership)
{
h_.cu() = context;
}
Context::Context(cl_context const & context, bool take_ownership) : backend_(OPENCL), device_(ocl::info<CL_CONTEXT_DEVICES>(context)[0], false), cache_path_(tools::getenv("ISAAC_CACHE_PATH")), h_(backend_, take_ownership)
Context::Context(cl_context const & context, bool take_ownership) : backend_(OPENCL), device_(ocl::info<CL_CONTEXT_DEVICES>(context)[0], false), cache_path_(cache_path()), h_(backend_, take_ownership)
{
h_.cl() = context;
}
Context::Context(Device const & device) : backend_(device.backend_), device_(device), cache_path_(tools::getenv("ISAAC_CACHE_PATH")), h_(backend_, true)
Context::Context(Device const & device) : backend_(device.backend_), device_(device), cache_path_(cache_path()), h_(backend_, true)
{
switch(backend_)
{

View File

@@ -309,6 +309,8 @@ void mapped_repeat::postprocess(std::string &res) const
replace_macro(res, "$VALUE", MorphValue(type_));
accessors["arrayn"] = res;
accessors["array1n"] = res;
accessors["arrayn1"] = res;
accessors["arraynn"] = res;
res = isaac::evaluate(LHS_NODE_TYPE, accessors, *info_.math_expression, info_.root_idx, *info_.mapping);
}

View File

@@ -419,15 +419,9 @@ void math_expression_representation_functor::append(lhs_rhs_element const & lhs_
{
if(lhs_rhs.subtype==DENSE_ARRAY_TYPE)
{
char prefix;
if(lhs_rhs.array->shape().max()==1)
prefix = '0';
else if(lhs_rhs.array->dim()==1 || lhs_rhs.array->shape().min()==1)
prefix = '1';
else
prefix = '2';
for(int i = 0 ; i < lhs_rhs.array->dim() ; ++i)
*ptr_++= lhs_rhs.array->shape()[i]>1?'n':'1';
numeric_type dtype = lhs_rhs.array->dtype();
*ptr_++=prefix;
*ptr_++=(char)dtype;
append_id(ptr_, binder_.get(lhs_rhs.array, is_assigned));

View File

@@ -46,6 +46,7 @@ std::string axpy::generate_impl(std::string const & suffix, math_expression cons
std::vector<size_t> assigned_scalar = filter_nodes([](math_expression::node const & node) {
return detail::is_assignment(node.op) && node.lhs.subtype==DENSE_ARRAY_TYPE && node.lhs.array->shape().max()==1;
}, expressions, expressions.root(), true);
switch(backend)
{
case driver::CUDA:
@@ -59,6 +60,7 @@ std::string axpy::generate_impl(std::string const & suffix, math_expression cons
stream.inc_tab();
process(stream, PARENT_NODE_TYPE, {{"array1", "#scalartype #namereg = #pointer[#start];"},
{"array11", "#scalartype #namereg = #pointer[#start];"},
{"arrayn", "#pointer += #start;"}}, expressions, mappings);
stream << _size_t << " idx = " << GlobalIdx0(backend) << ";" << std::endl;
@@ -107,7 +109,7 @@ std::string axpy::generate_impl(std::string const & suffix, math_expression cons
//Declares register to store results
for(std::size_t idx: assigned)
{
process(stream, LHS_NODE_TYPE, {{"arrayn", dtype + " #namereg;"}, {"arraynn", dtype + " #namereg;"}, {"matrix_row", "#scalartype #namereg;"},
process(stream, LHS_NODE_TYPE, {{"arrayn", dtype + " #namereg;"}, {"arrayn1", dtype + " #namereg;"}, {"array1n", dtype + " #namereg;"}, {"arraynn", dtype + " #namereg;"}, {"matrix_row", "#scalartype #namereg;"},
{"matrix_column", "#scalartype #namereg;"}, {"matrix_diag", "#scalartype #namereg;"}}, expressions, idx, mappings, processed);
}
@@ -119,17 +121,21 @@ std::string axpy::generate_impl(std::string const & suffix, math_expression cons
std::string matrix_row = dtype + " #namereg = " + vload(p_.simd_width, "#scalartype", "i*#ld", "#pointer + #row*#stride", "#ld", backend, false) + ";";
std::string matrix_column = dtype + " #namereg = " + vload(p_.simd_width, "#scalartype", "i*#stride", "#pointer + #column*#ld", "#stride", backend, false) + ";";
std::string matrix_diag = dtype + " #namereg = " + vload(p_.simd_width, "#scalartype", "i*(#ld + #stride)", "#pointer + ((#diag_offset<0)?-#diag_offset:(#diag_offset*#ld))", "#ld + #stride", backend, false) + ";";
process(stream, RHS_NODE_TYPE, {{"arrayn", arrayn}, {"matrix_row", matrix_row}, {"matrix_column", matrix_column},
process(stream, RHS_NODE_TYPE, {{"arrayn", arrayn}, {"arrayn1", arrayn}, {"array1n", arrayn}, {"matrix_row", matrix_row}, {"matrix_column", matrix_column},
{"matrix_diag", matrix_diag}, {"array_access", array_access}}, expressions, idx, mappings, processed);
}
//Compute expressions
for(std::size_t idx: assigned)
stream << evaluate(PARENT_NODE_TYPE, {{"array1", "#namereg"}, {"arrayn", "#namereg"},
for(std::size_t idx: assigned){
std::string host_scalar_access = "#name";
if(p_.simd_width>1 && std::find(assigned_scalar.begin(), assigned_scalar.end(), idx)==assigned_scalar.end())
host_scalar_access = InitPrefix(backend, dtype).get() + "(#name)";
stream << evaluate(PARENT_NODE_TYPE, {{"array1", "#namereg"}, {"arrayn1", "#namereg"}, {"array1n", "#namereg"}, {"array11", "#namereg"}, {"arrayn", "#namereg"},
{"matrix_row", "#namereg"}, {"matrix_column", "#namereg"}, {"matrix_diag", "#namereg"}, {"array_access", "#namereg"},
{"cast", CastPrefix(backend, dtype).get()}, {"placeholder", "#name"}, {"host_scalar", p_.simd_width==1?"#name": InitPrefix(backend, dtype).get() + "(#name)"}},
{"cast", CastPrefix(backend, dtype).get()}, {"placeholder", "#name"}, {"host_scalar", host_scalar_access}},
expressions, idx, mappings) << ";" << std::endl;
}
//Writes back to registers
processed.clear();
@@ -139,7 +145,7 @@ std::string axpy::generate_impl(std::string const & suffix, math_expression cons
std::string matrix_row = vstore(p_.simd_width, "#scalartype", "#namereg", "i*#ld", "#pointer + #row*#stride", "#ld", backend, false) + ";";
std::string matrix_column = vstore(p_.simd_width, "#scalartype", "#namereg", "i*#stride", "#pointer + #column*#ld", "#stride", backend, false) + ";";
std::string matrix_diag = vstore(p_.simd_width, "#scalartype", "#namereg", "i*(#ld + #stride)", "#pointer + (#diag_offset<0)?-#diag_offset:(#diag_offset*#ld)", "#ld + #stride", backend, false) + ";";
process(stream, LHS_NODE_TYPE, {{"arrayn", arrayn}, {"matrix_row", matrix_row}, {"matrix_column", matrix_column}, {"matrix_diag", matrix_diag}}, expressions, idx, mappings, processed);
process(stream, LHS_NODE_TYPE, {{"arrayn", arrayn}, {"array1n", arrayn}, {"arrayn1", arrayn}, {"matrix_row", matrix_row}, {"matrix_column", matrix_column}, {"matrix_diag", matrix_diag}}, expressions, idx, mappings, processed);
}
if(sfors.size()){
@@ -157,14 +163,13 @@ std::string axpy::generate_impl(std::string const & suffix, math_expression cons
stream << "{" << std::endl;
stream.inc_tab();
for(std::size_t idx: assigned)
process(stream, LHS_NODE_TYPE, { {"array1", "#pointer[#start] = #namereg;"} }, expressions, idx, mappings, processed);
process(stream, LHS_NODE_TYPE, { {"array1", "#pointer[#start] = #namereg;"}, {"array11", "#pointer[#start] = #namereg;"} }, expressions, idx, mappings, processed);
stream.dec_tab();
stream << "}" << std::endl;
}
stream.dec_tab();
stream << "}" << std::endl;
// std::cout << stream.str() << std::endl;
return stream.str();
}

View File

@@ -258,6 +258,7 @@ std::string dot::generate_impl(std::string const & suffix, math_expression const
std::map<std::string, std::string> accessors;
accessors["scalar_dot"] = "#name_buf[0]";
accessors["array1"] = "#pointer[#start]";
accessors["array11"] = "#pointer[#start]";
stream << evaluate(PARENT_NODE_TYPE, accessors, expressions, expressions.root(), mapping) << ";" << std::endl;
stream.dec_tab();
stream << "}" << std::endl;

View File

@@ -516,7 +516,10 @@ gemm_parameters::gemm_parameters(unsigned int simd_width
{
string Ci = to_string((m/p_.simd_width)*(p_.local_size_0*p_.simd_width) + m%p_.simd_width);
stream << "if(" << Ci << "< M) ";
stream << "C[" << Ci << CSTRIDE1 << "] = rC[" << m << "][" << n << "] + (beta?beta*" << "C[" << Ci << CSTRIDE1 << "]:0);" << std::endl;
if(has_depth)
stream << "C[" << Ci << CSTRIDE1 << "] = rC[" << m << "][" << n << "];" << std::endl;
else
stream << "C[" << Ci << CSTRIDE1 << "] = rC[" << m << "][" << n << "] + (beta?(beta*" << "C[" << Ci << CSTRIDE1 << "]):0);" << std::endl;
}
if((n+1)%p_.simd_width==0){
stream << "C += ldc*" << p_.local_size_1*p_.simd_width - p_.simd_width + 1 << ";" << std::endl;
@@ -552,7 +555,7 @@ gemm_parameters::gemm_parameters(unsigned int simd_width
stream.inc_tab();
stream << "acc += Z[i + j*Zld + k*Zld*N];" << std::endl;
stream.dec_tab();
stream << "C[i*Cstride + j*ldc] = acc + beta*C[i + j*ldc];" << std::endl;
stream << "C[i*Cstride + j*ldc] = acc + beta*C[i*Cstride + j*ldc];" << std::endl;
stream.dec_tab();
stream << "}" << std::endl;
stream.dec_tab();

View File

@@ -190,6 +190,8 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
if(col_simd_width > 1)
accessors["gemv"] = access_vector_type(accessors["gemv"], s);
accessors["arrayn"] = "#pointer[(r +" + to_string(s) + ")*#stride]";
accessors["array1n"] = "#pointer[(r +" + to_string(s) + ")*#stride]";
accessors["arrayn1"] = "#pointer[(r +" + to_string(s) + ")*#stride]";
stream << evaluate(PARENT_NODE_TYPE, accessors, expression, expression.root(), mapping) << ";" << std::endl;
}
}
@@ -244,6 +246,8 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
process(stream, PARENT_NODE_TYPE,
{{"array1", "#scalartype #namereg = #pointer[#start];"},
{"arrayn", "#pointer += #start;"},
{"array1n", "#pointer += #start;"},
{"arrayn1", "#pointer += #start;"},
{"arraynn", "#pointer += #start; "}}, expression, mapping);
for (const auto & e : dots)
@@ -309,6 +313,8 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
std::map<std::string, std::string> accessors;
accessors["gemv"] = "#name_buf[lidy*" + local_size_0_ld_str + "]";
accessors["arrayn"] = "#pointer[r*#stride]";
accessors["array1n"] = "#pointer[r*#stride]";
accessors["arrayn1"] = "#pointer[r*#stride]";
stream << evaluate(PARENT_NODE_TYPE, accessors, expression, expression.root(), mapping) << ";" << std::endl;
stream.dec_tab();

View File

@@ -102,7 +102,6 @@ std::string ger::generate_impl(std::string const & suffix, math_expression const
stream.dec_tab();
stream << "}" << std::endl;
return stream.str();
}

View File

@@ -72,9 +72,10 @@ public:
{
kernel_.setArg(current_arg_++, a->data());
kernel_.setSizeArg(current_arg_++, a->start());
for(int_t i = 0 ; i < a->dim() ; i++)
for(int_t i = 0 ; i < a->dim() ; i++){
if(a->shape()[i] > 1)
kernel_.setSizeArg(current_arg_++, a->stride()[i]);
}
}
}

View File

@@ -55,7 +55,7 @@ inline std::string vstore(unsigned int simd_width, std::string const & dtype, st
std::string stridestr = (stride=="1")?"":("*" + stride);
std::string res;
for(unsigned int s = 0 ; s < simd_width ; ++s)
res += (s>0?";(":"(") + ptr + ")[" + offset + " + " + tools::to_string(s) + stridestr + "] = " + access_vector_type(value, s);
res += (s>0?";(":"(") + ptr + ")[" + offset + "*" + tools::to_string(simd_width) + " + " + tools::to_string(s) + stridestr + "] = " + access_vector_type(value, s);
return res;
}
}

View File

@@ -112,13 +112,14 @@ namespace isaac
{
math_expression::node & node = array[idx];
auto ng1 = [](shape_t const & shape){ size_t res = 0 ; for(size_t i = 0 ; i < shape.size() ; ++i) res += (shape[i] > 1); return res;};
//Left
expression_type type_left = INVALID_EXPRESSION_TYPE;
if (node.lhs.type_family == COMPOSITE_OPERATOR_FAMILY)
parse(array, node.lhs.node_index, breakpoints, type_left, false);
else if(node.lhs.subtype == DENSE_ARRAY_TYPE)
{
if(node.op.type==OPERATOR_MATRIX_ROW_TYPE || node.op.type==OPERATOR_MATRIX_COLUMN_TYPE || node.lhs.array->dim()==1)
if(node.op.type==OPERATOR_MATRIX_ROW_TYPE || node.op.type==OPERATOR_MATRIX_COLUMN_TYPE || ng1(node.lhs.array->shape())<=1)
type_left = AXPY_TYPE;
else
type_left = GER_TYPE;
@@ -130,13 +131,12 @@ namespace isaac
parse(array, node.rhs.node_index, breakpoints, type_right, false);
else if(node.rhs.subtype == DENSE_ARRAY_TYPE)
{
if(node.op.type==OPERATOR_MATRIX_ROW_TYPE || node.op.type==OPERATOR_MATRIX_COLUMN_TYPE || node.rhs.array->dim()==1)
if(node.op.type==OPERATOR_MATRIX_ROW_TYPE || node.op.type==OPERATOR_MATRIX_COLUMN_TYPE || ng1(node.rhs.array->shape())<=1)
type_right = AXPY_TYPE;
else
type_right = GER_TYPE;
}
final_type = merge(array[idx].op, type_left, type_right);
std::pair<bool, bool> tmp = has_temporary(array[idx].op, type_left, type_right, is_first);
if(tmp.first)
@@ -172,7 +172,8 @@ namespace isaac
//Init
expression_type current_type;
if(expression.dim()==1)
auto ng1 = [](shape_t const & shape){ size_t res = 0 ; for(size_t i = 0 ; i < shape.size() ; ++i) res += (shape[i] > 1); return res;};
if(ng1(expression.shape())<=1)
current_type=AXPY_TYPE;
else
current_type=GER_TYPE;
@@ -222,6 +223,7 @@ namespace isaac
}
/*-----Compute final expression-----*/
// std::cout << final_type << std::endl;
profiles[std::make_pair(final_type, dtype)]->execute(execution_handler(expression, c.execution_options(), c.dispatcher_options(), c.compilation_options()));
}

View File

@@ -169,6 +169,27 @@ extern "C"
MAKE_GEMV(S, sc::FLOAT_TYPE, float)
MAKE_GEMV(D, sc::DOUBLE_TYPE, double)
/* Generates the cuBLAS-compatible GER (rank-1 update) entry points for one scalar type:
 *     A := alpha * x * y^T + A
 * where A is m-by-n, x has m elements (increment incx) and y has n elements (increment incy).
 *
 *   TYPE_CHAR  - BLAS type letter pasted into the symbol name (S, D, ...)
 *   TYPE_ISAAC - matching isaac numeric_type enumerator
 *   TYPE_CU    - matching C scalar type
 *
 * Two symbols are emitted:
 *   cublas<T>ger     - legacy interface, alpha passed by value
 *   cublas<T>ger_v2  - handle-based interface, alpha passed by pointer; the handle is ignored
 *                      and CUBLAS_STATUS_SUCCESS is always returned.
 *
 * The raw device pointers are wrapped in driver::Buffer objects constructed with `false`
 * (presumably "do not take ownership" -- confirm against driver::Buffer).
 * NOTE(review): negative increments are forwarded unchanged; verify sc::array handles them
 * if BLAS-style reversed traversal is required.
 *
 * Comments live outside the macro body on purpose: a line comment inside a
 * backslash-continued macro would swallow the rest of the definition.
 */
#define MAKE_GER(TYPE_CHAR, TYPE_ISAAC, TYPE_CU) \
void cublas ## TYPE_CHAR ## ger (int m, int n, TYPE_CU alpha, const TYPE_CU *x, int incx,\
const TYPE_CU *y, int incy, TYPE_CU *A, int lda)\
{\
/* x spans the rows of A, so it holds m elements (it was wrongly sized with n) */\
sc::array dx((sc::int_t)m, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)x,false), 0, incx); \
/* y spans the columns of A, so it holds n elements */\
sc::array dy((sc::int_t)n, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)y,false), 0, incy); \
sc::array dA((sc::int_t)m, (sc::int_t)n, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)A, false), 0, (sc::int_t)lda);\
sc::execute(sc::assign(dA, alpha*sc::outer(dx, dy) + dA));\
}\
cublasStatus_t cublas ## TYPE_CHAR ## ger_v2 (cublasHandle_t, int m, int n, const TYPE_CU * alpha, const TYPE_CU *x, int incx,\
const TYPE_CU *y, int incy, TYPE_CU *A, int lda)\
{\
cublas ## TYPE_CHAR ## ger(m, n, *alpha, x, incx, y, incy, A, lda);\
return CUBLAS_STATUS_SUCCESS;\
}
MAKE_GER(S, sc::FLOAT_TYPE, float)
MAKE_GER(D, sc::DOUBLE_TYPE, double)
//*****************
//BLAS3
//*****************
@@ -179,6 +200,13 @@ extern "C"
const TYPE_CU *B, int ldb, TYPE_CU beta, TYPE_CU *C,\
int ldc)\
{\
if(k==1 && m>1 && n>1){\
sc::array dA((sc::int_t)m, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)A, false), 0, transa=='N'?1:lda);\
sc::array dB((sc::int_t)n, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)B, false), 0, transb=='T'?1:ldb);\
sc::array dC((sc::int_t)m, (sc::int_t)n, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)C, false), 0, (sc::int_t)ldc);\
sc::execute(sc::assign(dC, alpha*sc::outer(dA, dB) + beta*dC));\
return;\
}\
sc::int_t As1 = (sc::int_t)m, As2 = (sc::int_t)k;\
sc::int_t Bs1 = (sc::int_t)k, Bs2 = (sc::int_t)n;\
if(transa=='T') std::swap(As1, As2);\

View File

@@ -17,7 +17,7 @@ void test(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> const & c
sc::int_t N = C.shape()[1];
sc::int_t K = A.shape()[1];
T alpha = 1;
T alpha = 1.43;
T beta = 0;
sc::driver::CommandQueue queue = sc::driver::backend::queues::get(C.context(),0);
@@ -51,7 +51,7 @@ void test(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> const & c
std::cout << " [Failure!]" << std::endl;\
}\
else\
std::cout << std::endl;
std::cout << std::endl;\
if(C.context().backend()==sc::driver::OPENCL && interf==clBLAS)
{
@@ -95,10 +95,10 @@ void test(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> const & c
if(interf==CPP)
{
RUN_TEST("C = A * B", C = dot(A,B))
RUN_TEST("C = A' * B", C = dot(trans(AT),B))
RUN_TEST("C = A * B'", C = dot(A,trans(BT)))
RUN_TEST("C = A' * B'", C = dot(trans(AT),trans(BT)))
RUN_TEST("C = A * B", C = alpha*dot(A,B) + beta*C)
RUN_TEST("C = A' * B", C = alpha*dot(AT.T,B) + beta*C)
RUN_TEST("C = A * B'", C = alpha*dot(A,BT.T) + beta*C)
RUN_TEST("C = A' * B'", C = alpha*dot(AT.T,BT.T) + beta*C)
}
if(failure_count>0)