Kernels: Fixed various corner cases for the kernel templates and BLAS
This commit is contained in:
@@ -20,6 +20,8 @@ class ISAACAPI Context
|
||||
friend class CommandQueue;
|
||||
friend class Buffer;
|
||||
|
||||
static std::string cache_path();
|
||||
|
||||
public:
|
||||
explicit Context(CUcontext const & context, CUdevice const & device, bool take_ownership = true);
|
||||
explicit Context(cl_context const & context, bool take_ownership = true);
|
||||
|
@@ -903,6 +903,8 @@ math_expression dot(LTYPE const & x, RTYPE const & y)\
|
||||
{\
|
||||
numeric_type dtype = x.dtype();\
|
||||
driver::Context const & context = x.context();\
|
||||
if(x.shape().max()==1 || y.shape().max()==1)\
|
||||
return x*y;\
|
||||
if(x.dim()==2 && x.shape()[1]==0)\
|
||||
return zeros(x.shape()[0], y.shape()[1], dtype, context);\
|
||||
if(x.shape()[0]==0 || (y.dim()==2 && y.shape()[1]==0))\
|
||||
@@ -927,10 +929,12 @@ math_expression dot(LTYPE const & x, RTYPE const & y)\
|
||||
else\
|
||||
return trans(detail::matvecprod(trans(y), trans(x)));\
|
||||
}\
|
||||
if(x.shape()[0]==1)\
|
||||
if(x.shape()[0]==1 && y.shape()[1]==1)\
|
||||
return sum(x*trans(y));\
|
||||
if(x.shape()[1]==1)\
|
||||
return outer(x, y);\
|
||||
if(x.shape()[0]==1 && y.shape()[1]==2)\
|
||||
return trans(detail::matvecprod(trans(y), trans(x)));\
|
||||
if(x.shape()[1]==1 && y.shape()[0]==1)\
|
||||
return x*y;\
|
||||
else /*if(x.dim()==2 && y.dim()==2)*/\
|
||||
return detail::matmatprod(x, y);\
|
||||
}
|
||||
@@ -995,7 +999,7 @@ void copy(void const * data, array_base& x, driver::CommandQueue & queue, bool b
|
||||
void copy(array_base const & x, void* data, driver::CommandQueue & queue, bool blocking)
|
||||
{
|
||||
unsigned int dtypesize = size_of(x.dtype());
|
||||
if(x.start()==0 && x.shape()[0]*x.stride().prod()==x.shape().prod())
|
||||
if(x.start()==0 && x.stride().prod()==x.shape().prod())
|
||||
{
|
||||
queue.read(x.data(), blocking, 0, x.shape().prod()*dtypesize, data);
|
||||
}
|
||||
|
@@ -4,7 +4,9 @@
|
||||
#include "isaac/driver/program.h"
|
||||
|
||||
#include "helpers/ocl/infos.hpp"
|
||||
|
||||
#include "getenv.hpp"
|
||||
#include "mkdir.hpp"
|
||||
|
||||
namespace isaac
|
||||
{
|
||||
@@ -12,17 +14,37 @@ namespace isaac
|
||||
namespace driver
|
||||
{
|
||||
|
||||
Context::Context(CUcontext const & context, CUdevice const & device, bool take_ownership) : backend_(CUDA), device_(device, false), cache_path_(tools::getenv("ISAAC_CACHE_PATH")), h_(backend_, take_ownership)
|
||||
std::string Context::cache_path()
|
||||
{
|
||||
//user-specified cache path
|
||||
std::string result = tools::getenv("ISAAC_CACHE_PATH");
|
||||
if(!result.empty())
|
||||
return result;
|
||||
|
||||
//create in home
|
||||
result = tools::getenv("HOME");
|
||||
if(!result.empty())
|
||||
{
|
||||
result = result + "/.isaac/cache/";
|
||||
if(tools::mkpath(result)==0)
|
||||
return result;
|
||||
}
|
||||
|
||||
//couldn't find a directory
|
||||
return "";
|
||||
}
|
||||
|
||||
// Builds a Context around an already existing CUDA context.
//   context        : CUDA context stored into the handle h_.
//   device         : CUDA device used to construct device_ (second argument false).
//   take_ownership : forwarded unchanged to the handle h_.
// The cache directory is resolved through cache_path().
Context::Context(CUcontext const & context, CUdevice const & device, bool take_ownership)
  : backend_(CUDA),
    device_(device, false),
    cache_path_(cache_path()),
    h_(backend_, take_ownership)
{
  h_.cu() = context;
}
|
||||
|
||||
Context::Context(cl_context const & context, bool take_ownership) : backend_(OPENCL), device_(ocl::info<CL_CONTEXT_DEVICES>(context)[0], false), cache_path_(tools::getenv("ISAAC_CACHE_PATH")), h_(backend_, take_ownership)
|
||||
// Builds a Context around an already existing OpenCL context.
//   context        : OpenCL context stored into the handle h_.
//   take_ownership : forwarded unchanged to the handle h_.
// device_ is constructed from the first entry reported by CL_CONTEXT_DEVICES
// (second argument false); the cache directory is resolved through cache_path().
Context::Context(cl_context const & context, bool take_ownership)
  : backend_(OPENCL),
    device_(ocl::info<CL_CONTEXT_DEVICES>(context)[0], false),
    cache_path_(cache_path()),
    h_(backend_, take_ownership)
{
  h_.cl() = context;
}
|
||||
|
||||
Context::Context(Device const & device) : backend_(device.backend_), device_(device), cache_path_(tools::getenv("ISAAC_CACHE_PATH")), h_(backend_, true)
|
||||
Context::Context(Device const & device) : backend_(device.backend_), device_(device), cache_path_(cache_path()), h_(backend_, true)
|
||||
{
|
||||
switch(backend_)
|
||||
{
|
||||
|
@@ -309,6 +309,8 @@ void mapped_repeat::postprocess(std::string &res) const
|
||||
|
||||
replace_macro(res, "$VALUE", MorphValue(type_));
|
||||
accessors["arrayn"] = res;
|
||||
accessors["array1n"] = res;
|
||||
accessors["arrayn1"] = res;
|
||||
accessors["arraynn"] = res;
|
||||
res = isaac::evaluate(LHS_NODE_TYPE, accessors, *info_.math_expression, info_.root_idx, *info_.mapping);
|
||||
}
|
||||
|
@@ -419,15 +419,9 @@ void math_expression_representation_functor::append(lhs_rhs_element const & lhs_
|
||||
{
|
||||
if(lhs_rhs.subtype==DENSE_ARRAY_TYPE)
|
||||
{
|
||||
char prefix;
|
||||
if(lhs_rhs.array->shape().max()==1)
|
||||
prefix = '0';
|
||||
else if(lhs_rhs.array->dim()==1 || lhs_rhs.array->shape().min()==1)
|
||||
prefix = '1';
|
||||
else
|
||||
prefix = '2';
|
||||
for(int i = 0 ; i < lhs_rhs.array->dim() ; ++i)
|
||||
*ptr_++= lhs_rhs.array->shape()[i]>1?'n':'1';
|
||||
numeric_type dtype = lhs_rhs.array->dtype();
|
||||
*ptr_++=prefix;
|
||||
*ptr_++=(char)dtype;
|
||||
|
||||
append_id(ptr_, binder_.get(lhs_rhs.array, is_assigned));
|
||||
|
@@ -46,6 +46,7 @@ std::string axpy::generate_impl(std::string const & suffix, math_expression cons
|
||||
std::vector<size_t> assigned_scalar = filter_nodes([](math_expression::node const & node) {
|
||||
return detail::is_assignment(node.op) && node.lhs.subtype==DENSE_ARRAY_TYPE && node.lhs.array->shape().max()==1;
|
||||
}, expressions, expressions.root(), true);
|
||||
|
||||
switch(backend)
|
||||
{
|
||||
case driver::CUDA:
|
||||
@@ -59,6 +60,7 @@ std::string axpy::generate_impl(std::string const & suffix, math_expression cons
|
||||
stream.inc_tab();
|
||||
|
||||
process(stream, PARENT_NODE_TYPE, {{"array1", "#scalartype #namereg = #pointer[#start];"},
|
||||
{"array11", "#scalartype #namereg = #pointer[#start];"},
|
||||
{"arrayn", "#pointer += #start;"}}, expressions, mappings);
|
||||
|
||||
stream << _size_t << " idx = " << GlobalIdx0(backend) << ";" << std::endl;
|
||||
@@ -107,7 +109,7 @@ std::string axpy::generate_impl(std::string const & suffix, math_expression cons
|
||||
//Declares register to store results
|
||||
for(std::size_t idx: assigned)
|
||||
{
|
||||
process(stream, LHS_NODE_TYPE, {{"arrayn", dtype + " #namereg;"}, {"arraynn", dtype + " #namereg;"}, {"matrix_row", "#scalartype #namereg;"},
|
||||
process(stream, LHS_NODE_TYPE, {{"arrayn", dtype + " #namereg;"}, {"arrayn1", dtype + " #namereg;"}, {"array1n", dtype + " #namereg;"}, {"arraynn", dtype + " #namereg;"}, {"matrix_row", "#scalartype #namereg;"},
|
||||
{"matrix_column", "#scalartype #namereg;"}, {"matrix_diag", "#scalartype #namereg;"}}, expressions, idx, mappings, processed);
|
||||
}
|
||||
|
||||
@@ -119,17 +121,21 @@ std::string axpy::generate_impl(std::string const & suffix, math_expression cons
|
||||
std::string matrix_row = dtype + " #namereg = " + vload(p_.simd_width, "#scalartype", "i*#ld", "#pointer + #row*#stride", "#ld", backend, false) + ";";
|
||||
std::string matrix_column = dtype + " #namereg = " + vload(p_.simd_width, "#scalartype", "i*#stride", "#pointer + #column*#ld", "#stride", backend, false) + ";";
|
||||
std::string matrix_diag = dtype + " #namereg = " + vload(p_.simd_width, "#scalartype", "i*(#ld + #stride)", "#pointer + ((#diag_offset<0)?-#diag_offset:(#diag_offset*#ld))", "#ld + #stride", backend, false) + ";";
|
||||
process(stream, RHS_NODE_TYPE, {{"arrayn", arrayn}, {"matrix_row", matrix_row}, {"matrix_column", matrix_column},
|
||||
process(stream, RHS_NODE_TYPE, {{"arrayn", arrayn}, {"arrayn1", arrayn}, {"array1n", arrayn}, {"matrix_row", matrix_row}, {"matrix_column", matrix_column},
|
||||
{"matrix_diag", matrix_diag}, {"array_access", array_access}}, expressions, idx, mappings, processed);
|
||||
}
|
||||
|
||||
|
||||
//Compute expressions
|
||||
for(std::size_t idx: assigned)
|
||||
stream << evaluate(PARENT_NODE_TYPE, {{"array1", "#namereg"}, {"arrayn", "#namereg"},
|
||||
for(std::size_t idx: assigned){
|
||||
std::string host_scalar_access = "#name";
|
||||
if(p_.simd_width>1 && std::find(assigned_scalar.begin(), assigned_scalar.end(), idx)==assigned_scalar.end())
|
||||
host_scalar_access = InitPrefix(backend, dtype).get() + "(#name)";
|
||||
stream << evaluate(PARENT_NODE_TYPE, {{"array1", "#namereg"}, {"arrayn1", "#namereg"}, {"array1n", "#namereg"}, {"array11", "#namereg"}, {"arrayn", "#namereg"},
|
||||
{"matrix_row", "#namereg"}, {"matrix_column", "#namereg"}, {"matrix_diag", "#namereg"}, {"array_access", "#namereg"},
|
||||
{"cast", CastPrefix(backend, dtype).get()}, {"placeholder", "#name"}, {"host_scalar", p_.simd_width==1?"#name": InitPrefix(backend, dtype).get() + "(#name)"}},
|
||||
{"cast", CastPrefix(backend, dtype).get()}, {"placeholder", "#name"}, {"host_scalar", host_scalar_access}},
|
||||
expressions, idx, mappings) << ";" << std::endl;
|
||||
}
|
||||
|
||||
//Writes back to registers
|
||||
processed.clear();
|
||||
@@ -139,7 +145,7 @@ std::string axpy::generate_impl(std::string const & suffix, math_expression cons
|
||||
std::string matrix_row = vstore(p_.simd_width, "#scalartype", "#namereg", "i*#ld", "#pointer + #row*#stride", "#ld", backend, false) + ";";
|
||||
std::string matrix_column = vstore(p_.simd_width, "#scalartype", "#namereg", "i*#stride", "#pointer + #column*#ld", "#stride", backend, false) + ";";
|
||||
std::string matrix_diag = vstore(p_.simd_width, "#scalartype", "#namereg", "i*(#ld + #stride)", "#pointer + (#diag_offset<0)?-#diag_offset:(#diag_offset*#ld)", "#ld + #stride", backend, false) + ";";
|
||||
process(stream, LHS_NODE_TYPE, {{"arrayn", arrayn}, {"matrix_row", matrix_row}, {"matrix_column", matrix_column}, {"matrix_diag", matrix_diag}}, expressions, idx, mappings, processed);
|
||||
process(stream, LHS_NODE_TYPE, {{"arrayn", arrayn}, {"array1n", arrayn}, {"arrayn1", arrayn}, {"matrix_row", matrix_row}, {"matrix_column", matrix_column}, {"matrix_diag", matrix_diag}}, expressions, idx, mappings, processed);
|
||||
}
|
||||
|
||||
if(sfors.size()){
|
||||
@@ -157,14 +163,13 @@ std::string axpy::generate_impl(std::string const & suffix, math_expression cons
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
for(std::size_t idx: assigned)
|
||||
process(stream, LHS_NODE_TYPE, { {"array1", "#pointer[#start] = #namereg;"} }, expressions, idx, mappings, processed);
|
||||
process(stream, LHS_NODE_TYPE, { {"array1", "#pointer[#start] = #namereg;"}, {"array11", "#pointer[#start] = #namereg;"} }, expressions, idx, mappings, processed);
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
}
|
||||
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
// std::cout << stream.str() << std::endl;
|
||||
|
||||
return stream.str();
|
||||
}
|
||||
|
@@ -258,6 +258,7 @@ std::string dot::generate_impl(std::string const & suffix, math_expression const
|
||||
std::map<std::string, std::string> accessors;
|
||||
accessors["scalar_dot"] = "#name_buf[0]";
|
||||
accessors["array1"] = "#pointer[#start]";
|
||||
accessors["array11"] = "#pointer[#start]";
|
||||
stream << evaluate(PARENT_NODE_TYPE, accessors, expressions, expressions.root(), mapping) << ";" << std::endl;
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
@@ -516,7 +516,10 @@ gemm_parameters::gemm_parameters(unsigned int simd_width
|
||||
{
|
||||
string Ci = to_string((m/p_.simd_width)*(p_.local_size_0*p_.simd_width) + m%p_.simd_width);
|
||||
stream << "if(" << Ci << "< M) ";
|
||||
stream << "C[" << Ci << CSTRIDE1 << "] = rC[" << m << "][" << n << "] + (beta?beta*" << "C[" << Ci << CSTRIDE1 << "]:0);" << std::endl;
|
||||
if(has_depth)
|
||||
stream << "C[" << Ci << CSTRIDE1 << "] = rC[" << m << "][" << n << "];" << std::endl;
|
||||
else
|
||||
stream << "C[" << Ci << CSTRIDE1 << "] = rC[" << m << "][" << n << "] + (beta?(beta*" << "C[" << Ci << CSTRIDE1 << "]):0);" << std::endl;
|
||||
}
|
||||
if((n+1)%p_.simd_width==0){
|
||||
stream << "C += ldc*" << p_.local_size_1*p_.simd_width - p_.simd_width + 1 << ";" << std::endl;
|
||||
@@ -552,7 +555,7 @@ gemm_parameters::gemm_parameters(unsigned int simd_width
|
||||
stream.inc_tab();
|
||||
stream << "acc += Z[i + j*Zld + k*Zld*N];" << std::endl;
|
||||
stream.dec_tab();
|
||||
stream << "C[i*Cstride + j*ldc] = acc + beta*C[i + j*ldc];" << std::endl;
|
||||
stream << "C[i*Cstride + j*ldc] = acc + beta*C[i*Cstride + j*ldc];" << std::endl;
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
stream.dec_tab();
|
||||
|
@@ -190,6 +190,8 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
|
||||
if(col_simd_width > 1)
|
||||
accessors["gemv"] = access_vector_type(accessors["gemv"], s);
|
||||
accessors["arrayn"] = "#pointer[(r +" + to_string(s) + ")*#stride]";
|
||||
accessors["array1n"] = "#pointer[(r +" + to_string(s) + ")*#stride]";
|
||||
accessors["arrayn1"] = "#pointer[(r +" + to_string(s) + ")*#stride]";
|
||||
stream << evaluate(PARENT_NODE_TYPE, accessors, expression, expression.root(), mapping) << ";" << std::endl;
|
||||
}
|
||||
}
|
||||
@@ -244,6 +246,8 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
|
||||
process(stream, PARENT_NODE_TYPE,
|
||||
{{"array1", "#scalartype #namereg = #pointer[#start];"},
|
||||
{"arrayn", "#pointer += #start;"},
|
||||
{"array1n", "#pointer += #start;"},
|
||||
{"arrayn1", "#pointer += #start;"},
|
||||
{"arraynn", "#pointer += #start; "}}, expression, mapping);
|
||||
|
||||
for (const auto & e : dots)
|
||||
@@ -309,6 +313,8 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
|
||||
std::map<std::string, std::string> accessors;
|
||||
accessors["gemv"] = "#name_buf[lidy*" + local_size_0_ld_str + "]";
|
||||
accessors["arrayn"] = "#pointer[r*#stride]";
|
||||
accessors["array1n"] = "#pointer[r*#stride]";
|
||||
accessors["arrayn1"] = "#pointer[r*#stride]";
|
||||
stream << evaluate(PARENT_NODE_TYPE, accessors, expression, expression.root(), mapping) << ";" << std::endl;
|
||||
|
||||
stream.dec_tab();
|
||||
|
@@ -102,7 +102,6 @@ std::string ger::generate_impl(std::string const & suffix, math_expression const
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
|
||||
return stream.str();
|
||||
}
|
||||
|
||||
|
@@ -72,9 +72,10 @@ public:
|
||||
{
|
||||
kernel_.setArg(current_arg_++, a->data());
|
||||
kernel_.setSizeArg(current_arg_++, a->start());
|
||||
for(int_t i = 0 ; i < a->dim() ; i++)
|
||||
for(int_t i = 0 ; i < a->dim() ; i++){
|
||||
if(a->shape()[i] > 1)
|
||||
kernel_.setSizeArg(current_arg_++, a->stride()[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -55,7 +55,7 @@ inline std::string vstore(unsigned int simd_width, std::string const & dtype, st
|
||||
std::string stridestr = (stride=="1")?"":("*" + stride);
|
||||
std::string res;
|
||||
for(unsigned int s = 0 ; s < simd_width ; ++s)
|
||||
res += (s>0?";(":"(") + ptr + ")[" + offset + " + " + tools::to_string(s) + stridestr + "] = " + access_vector_type(value, s);
|
||||
res += (s>0?";(":"(") + ptr + ")[" + offset + "*" + tools::to_string(simd_width) + " + " + tools::to_string(s) + stridestr + "] = " + access_vector_type(value, s);
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
@@ -112,13 +112,14 @@ namespace isaac
|
||||
{
|
||||
math_expression::node & node = array[idx];
|
||||
|
||||
auto ng1 = [](shape_t const & shape){ size_t res = 0 ; for(size_t i = 0 ; i < shape.size() ; ++i) res += (shape[i] > 1); return res;};
|
||||
//Left
|
||||
expression_type type_left = INVALID_EXPRESSION_TYPE;
|
||||
if (node.lhs.type_family == COMPOSITE_OPERATOR_FAMILY)
|
||||
parse(array, node.lhs.node_index, breakpoints, type_left, false);
|
||||
else if(node.lhs.subtype == DENSE_ARRAY_TYPE)
|
||||
{
|
||||
if(node.op.type==OPERATOR_MATRIX_ROW_TYPE || node.op.type==OPERATOR_MATRIX_COLUMN_TYPE || node.lhs.array->dim()==1)
|
||||
if(node.op.type==OPERATOR_MATRIX_ROW_TYPE || node.op.type==OPERATOR_MATRIX_COLUMN_TYPE || ng1(node.lhs.array->shape())<=1)
|
||||
type_left = AXPY_TYPE;
|
||||
else
|
||||
type_left = GER_TYPE;
|
||||
@@ -130,13 +131,12 @@ namespace isaac
|
||||
parse(array, node.rhs.node_index, breakpoints, type_right, false);
|
||||
else if(node.rhs.subtype == DENSE_ARRAY_TYPE)
|
||||
{
|
||||
if(node.op.type==OPERATOR_MATRIX_ROW_TYPE || node.op.type==OPERATOR_MATRIX_COLUMN_TYPE || node.rhs.array->dim()==1)
|
||||
if(node.op.type==OPERATOR_MATRIX_ROW_TYPE || node.op.type==OPERATOR_MATRIX_COLUMN_TYPE || ng1(node.rhs.array->shape())<=1)
|
||||
type_right = AXPY_TYPE;
|
||||
else
|
||||
type_right = GER_TYPE;
|
||||
}
|
||||
|
||||
|
||||
final_type = merge(array[idx].op, type_left, type_right);
|
||||
std::pair<bool, bool> tmp = has_temporary(array[idx].op, type_left, type_right, is_first);
|
||||
if(tmp.first)
|
||||
@@ -172,7 +172,8 @@ namespace isaac
|
||||
|
||||
//Init
|
||||
expression_type current_type;
|
||||
if(expression.dim()==1)
|
||||
auto ng1 = [](shape_t const & shape){ size_t res = 0 ; for(size_t i = 0 ; i < shape.size() ; ++i) res += (shape[i] > 1); return res;};
|
||||
if(ng1(expression.shape())<=1)
|
||||
current_type=AXPY_TYPE;
|
||||
else
|
||||
current_type=GER_TYPE;
|
||||
@@ -222,6 +223,7 @@ namespace isaac
|
||||
}
|
||||
|
||||
/*-----Compute final expression-----*/
|
||||
// std::cout << final_type << std::endl;
|
||||
profiles[std::make_pair(final_type, dtype)]->execute(execution_handler(expression, c.execution_options(), c.dispatcher_options(), c.compilation_options()));
|
||||
}
|
||||
|
||||
|
@@ -169,6 +169,27 @@ extern "C"
|
||||
MAKE_GEMV(S, sc::FLOAT_TYPE, float)
|
||||
MAKE_GEMV(D, sc::DOUBLE_TYPE, double)
|
||||
|
||||
|
||||
// Generates the cuBLAS-compatible rank-1 update entry points for one scalar type:
//   cublas<TYPE_CHAR>ger    (legacy signature, alpha passed by value)
//   cublas<TYPE_CHAR>ger_v2 (handle-based signature, alpha passed by pointer)
//
// Both evaluate  A = alpha * outer(x, y) + A  where
//   A : m-by-n matrix with leading dimension lda,
//   x : vector of m elements with increment incx,
//   y : vector of n elements with increment incy.
// The device pointers are wrapped in buffers constructed with `false` as the
// second argument, then the expression is executed through sc::execute.
//
// Macro parameters:
//   TYPE_CHAR  : letter spliced into the function name (e.g. S, D)
//   TYPE_ISAAC : matching sc numeric type tag
//   TYPE_CU    : matching host scalar type
//
// The _v2 wrapper ignores the handle, dereferences alpha and always returns
// CUBLAS_STATUS_SUCCESS.
#define MAKE_GER(TYPE_CHAR, TYPE_ISAAC, TYPE_CU) \
    void cublas ## TYPE_CHAR ## ger (int m, int n, TYPE_CU alpha, const TYPE_CU *x, int incx,\
                                     const TYPE_CU *y, int incy, TYPE_CU *A, int lda)\
    {\
        /* x runs along the rows of A, so it holds m elements (it was sized with n before) */\
        sc::array dx((sc::int_t)m, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)x,false), 0, incx); \
        /* y runs along the columns of A, so it holds n elements */\
        sc::array dy((sc::int_t)n, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)y,false), 0, incy); \
        sc::array dA((sc::int_t)m, (sc::int_t)n, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)A, false), 0, (sc::int_t)lda);\
        sc::execute(sc::assign(dA, alpha*sc::outer(dx, dy) + dA));\
    }\
    cublasStatus_t cublas ## TYPE_CHAR ## ger_v2 (cublasHandle_t, int m, int n, const TYPE_CU * alpha, const TYPE_CU *x, int incx,\
                                                  const TYPE_CU *y, int incy, TYPE_CU *A, int lda)\
    {\
        cublas ## TYPE_CHAR ## ger(m, n, *alpha, x, incx, y, incy, A, lda);\
        return CUBLAS_STATUS_SUCCESS;\
    }
|
||||
|
||||
MAKE_GER(S, sc::FLOAT_TYPE, float)
|
||||
MAKE_GER(D, sc::DOUBLE_TYPE, double)
|
||||
|
||||
|
||||
//*****************
|
||||
//BLAS3
|
||||
//*****************
|
||||
@@ -179,6 +200,13 @@ extern "C"
|
||||
const TYPE_CU *B, int ldb, TYPE_CU beta, TYPE_CU *C,\
|
||||
int ldc)\
|
||||
{\
|
||||
if(k==1 && m>1 && n>1){\
|
||||
sc::array dA((sc::int_t)m, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)A, false), 0, transa=='N'?1:lda);\
|
||||
sc::array dB((sc::int_t)n, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)B, false), 0, transb=='T'?1:ldb);\
|
||||
sc::array dC((sc::int_t)m, (sc::int_t)n, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)C, false), 0, (sc::int_t)ldc);\
|
||||
sc::execute(sc::assign(dC, alpha*sc::outer(dA, dB) + beta*dC));\
|
||||
return;\
|
||||
}\
|
||||
sc::int_t As1 = (sc::int_t)m, As2 = (sc::int_t)k;\
|
||||
sc::int_t Bs1 = (sc::int_t)k, Bs2 = (sc::int_t)n;\
|
||||
if(transa=='T') std::swap(As1, As2);\
|
||||
|
@@ -17,7 +17,7 @@ void test(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> const & c
|
||||
sc::int_t N = C.shape()[1];
|
||||
sc::int_t K = A.shape()[1];
|
||||
|
||||
T alpha = 1;
|
||||
T alpha = 1.43;
|
||||
T beta = 0;
|
||||
|
||||
sc::driver::CommandQueue queue = sc::driver::backend::queues::get(C.context(),0);
|
||||
@@ -51,7 +51,7 @@ void test(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> const & c
|
||||
std::cout << " [Failure!]" << std::endl;\
|
||||
}\
|
||||
else\
|
||||
std::cout << std::endl;
|
||||
std::cout << std::endl;\
|
||||
|
||||
if(C.context().backend()==sc::driver::OPENCL && interf==clBLAS)
|
||||
{
|
||||
@@ -95,10 +95,10 @@ void test(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> const & c
|
||||
|
||||
if(interf==CPP)
|
||||
{
|
||||
RUN_TEST("C = A * B", C = dot(A,B))
|
||||
RUN_TEST("C = A' * B", C = dot(trans(AT),B))
|
||||
RUN_TEST("C = A * B'", C = dot(A,trans(BT)))
|
||||
RUN_TEST("C = A' * B'", C = dot(trans(AT),trans(BT)))
|
||||
RUN_TEST("C = A * B", C = alpha*dot(A,B) + beta*C)
|
||||
RUN_TEST("C = A' * B", C = alpha*dot(AT.T,B) + beta*C)
|
||||
RUN_TEST("C = A * B'", C = alpha*dot(A,BT.T) + beta*C)
|
||||
RUN_TEST("C = A' * B'", C = alpha*dot(AT.T,BT.T) + beta*C)
|
||||
}
|
||||
|
||||
if(failure_count>0)
|
||||
|
Reference in New Issue
Block a user