Core: added queue-wise temporary workspace. WARNING: breaks the fused computation of multiple DOT/GEMV operations
This commit is contained in:
@@ -262,11 +262,16 @@ void bench(sc::numeric_type dtype, std::string operation)
|
||||
MNKs.push_back(std::make_tuple("Square2560",'N','T',2560,2560,2560));
|
||||
|
||||
//Convolution
|
||||
MNKs.push_back(std::make_tuple("ConvAlexNet1",'N','N',3025,96,363));
|
||||
MNKs.push_back(std::make_tuple("ConvAlexNet2",'N','N',729,128,1200));
|
||||
MNKs.push_back(std::make_tuple("ConvAlexNet3",'N','N',169,384,2304));
|
||||
MNKs.push_back(std::make_tuple("ConvAlexNet4",'N','N',169,192,1728));
|
||||
MNKs.push_back(std::make_tuple("ConvAlexNet5",'N','N',169,128,1728));
|
||||
// MNKs.push_back(std::make_tuple("ConvAlexNet1",'N','N',3025,96,363));
|
||||
// MNKs.push_back(std::make_tuple("ConvAlexNet2",'N','N',729,128,1200));
|
||||
// MNKs.push_back(std::make_tuple("ConvAlexNet3",'N','N',169,384,2304));
|
||||
// MNKs.push_back(std::make_tuple("ConvAlexNet4",'N','N',169,192,1728));
|
||||
// MNKs.push_back(std::make_tuple("ConvAlexNet5",'N','N',169,128,1728));
|
||||
MNKs.push_back(std::make_tuple("ConvAlexNet1",'N','N',3025,64,363));
|
||||
MNKs.push_back(std::make_tuple("ConvAlexNet2",'N','N',729,192,1600));
|
||||
MNKs.push_back(std::make_tuple("ConvAlexNet3",'N','N',169,384,1728));
|
||||
MNKs.push_back(std::make_tuple("ConvAlexNet4",'N','N',169,256,3456));
|
||||
MNKs.push_back(std::make_tuple("ConvAlexNet5",'N','N',169,128,2304));
|
||||
// MNKs.push_back(std::make_tuple("ConvLeNet1,'N','N',576,20,25));
|
||||
// MNKs.push_back(std::make_tuple("ConvLeNet2",'N','N',64,50,500));
|
||||
|
||||
|
@@ -17,6 +17,7 @@ namespace isaac
|
||||
namespace driver
|
||||
{
|
||||
|
||||
class Buffer;
|
||||
class CommandQueue;
|
||||
class Context;
|
||||
class Platform;
|
||||
@@ -24,9 +25,19 @@ class ProgramCache;
|
||||
|
||||
class ISAACAPI backend
|
||||
{
|
||||
private:
|
||||
|
||||
public:
|
||||
class ISAACAPI workspaces
|
||||
{
|
||||
public:
|
||||
static const size_t SIZE = 1000000; //1MB of temporary workspace per queue
|
||||
static void release();
|
||||
static driver::Buffer & get(CommandQueue const & key);
|
||||
private:
|
||||
DISABLE_MSVC_WARNING_C4251
|
||||
static std::map<CommandQueue, Buffer * > cache_;
|
||||
RESTORE_MSVC_WARNING_C4251
|
||||
};
|
||||
|
||||
class ISAACAPI programs
|
||||
{
|
||||
friend class backend;
|
||||
|
@@ -1,4 +1,5 @@
|
||||
#include "isaac/driver/backend.h"
|
||||
#include "isaac/driver/buffer.h"
|
||||
#include "isaac/driver/context.h"
|
||||
#include "isaac/driver/command_queue.h"
|
||||
#include "isaac/driver/program_cache.h"
|
||||
@@ -13,6 +14,26 @@ namespace isaac
|
||||
namespace driver
|
||||
{
|
||||
|
||||
/*-----------------------------------*/
|
||||
//---------- Temporaries -----------*/
|
||||
/*-----------------------------------*/
|
||||
|
||||
void backend::workspaces::release()
|
||||
{
|
||||
for(auto & x: cache_)
|
||||
delete x.second;
|
||||
cache_.clear();
|
||||
}
|
||||
|
||||
driver::Buffer & backend::workspaces::get(CommandQueue const & key)
|
||||
{
|
||||
if(cache_.find(key)==cache_.end())
|
||||
return *cache_.insert(std::make_pair(key, new Buffer(key.context(), SIZE))).first->second;
|
||||
return *cache_.at(key);
|
||||
}
|
||||
|
||||
std::map<CommandQueue, Buffer * > backend::workspaces::cache_;
|
||||
|
||||
/*-----------------------------------*/
|
||||
//---------- Programs --------------*/
|
||||
/*-----------------------------------*/
|
||||
@@ -178,6 +199,7 @@ void backend::synchronize(Context const & context)
|
||||
void backend::release()
|
||||
{
|
||||
backend::programs::release();
|
||||
backend::workspaces::release();
|
||||
backend::queues::release();
|
||||
backend::contexts::release();
|
||||
}
|
||||
|
@@ -61,7 +61,6 @@ Context::Context(Device const & device) : backend_(device.backend_), device_(dev
|
||||
default:
|
||||
throw;
|
||||
}
|
||||
std::cout << "Shouldn't happen" << std::endl;
|
||||
}
|
||||
|
||||
bool Context::operator==(Context const & other) const
|
||||
|
@@ -318,31 +318,11 @@ void dot::enqueue(driver::CommandQueue & queue, driver::Program const & program,
|
||||
driver::NDRange local[2] = { driver::NDRange(p_.local_size_0), driver::NDRange(p_.local_size_0) };
|
||||
|
||||
//Arguments
|
||||
driver::Context const & context = x.context();
|
||||
unsigned int dtype_size = size_of(lhs_most(x.tree(), x.root()).lhs.dtype);
|
||||
for (auto & kernel : kernels)
|
||||
{
|
||||
unsigned int n_arg = 0;
|
||||
kernel.setSizeArg(n_arg++, size);
|
||||
|
||||
//Temporary buffers
|
||||
unsigned int i = 0;
|
||||
unsigned int j = 0;
|
||||
for (std::vector<math_expression::node const *>::const_iterator it = dots.begin(); it != dots.end(); ++it)
|
||||
{
|
||||
if (is_index_dot((*it)->op))
|
||||
{
|
||||
if (tmpidx_.size() <= j)
|
||||
tmpidx_.push_back(driver::Buffer(context, p_.num_groups*4));
|
||||
kernel.setArg(n_arg++, tmpidx_[j]);
|
||||
j++;
|
||||
}
|
||||
if (tmp_.size() <= i)
|
||||
tmp_.push_back(driver::Buffer(context, p_.num_groups*dtype_size));
|
||||
kernel.setArg(n_arg++, tmp_[i]);
|
||||
i++;
|
||||
}
|
||||
|
||||
kernel.setArg(n_arg++, driver::backend::workspaces::get(queue));
|
||||
set_arguments(x, kernel, n_arg, binding_policy_);
|
||||
}
|
||||
|
||||
|
@@ -586,29 +586,34 @@ gemm_parameters::gemm_parameters(unsigned int simd_width
|
||||
|
||||
gemm_name += suffix;
|
||||
reduce_name += suffix;
|
||||
bind_independent binder;
|
||||
|
||||
array_base const * out = &C;
|
||||
std::unique_ptr<array> tmp;
|
||||
if(p_.depth > 1){
|
||||
tmp.reset(new array(M, N, p_.depth, C.dtype(), C.context()));
|
||||
out = tmp.get();
|
||||
}
|
||||
|
||||
driver::Kernel gemm(program, gemm_name.c_str());
|
||||
driver::NDRange local(p_.local_size_0, p_.local_size_1, 1);
|
||||
|
||||
driver::NDRange global(align(align(M,p_.mS)/p_.mS, p_.local_size_0), align(align(N,p_.nS)/p_.nS, p_.local_size_1), p_.depth);
|
||||
|
||||
unsigned int current_arg = 0;
|
||||
bind_independent binder;
|
||||
set_arguments_functor helper(binder, current_arg, gemm);
|
||||
|
||||
driver::Buffer& workspace = driver::backend::workspaces::get(options.queue(C.context()));
|
||||
gemm.setSizeArg(current_arg++, M);
|
||||
gemm.setSizeArg(current_arg++, N);
|
||||
gemm.setSizeArg(current_arg++, K);
|
||||
gemm.setArg(current_arg++, out->data());
|
||||
gemm.setSizeArg(current_arg++, out->stride()[1]);
|
||||
gemm.setSizeArg(current_arg++, out->start());
|
||||
gemm.setSizeArg(current_arg++, out->stride()[0]);
|
||||
if(p_.depth==1)
|
||||
{
|
||||
gemm.setArg(current_arg++,C.data());
|
||||
gemm.setSizeArg(current_arg++, C.stride()[1]);
|
||||
gemm.setSizeArg(current_arg++, C.start());
|
||||
gemm.setSizeArg(current_arg++, C.stride()[0]);
|
||||
}
|
||||
else
|
||||
{
|
||||
gemm.setArg(current_arg++, workspace);
|
||||
gemm.setSizeArg(current_arg++, M);
|
||||
gemm.setSizeArg(current_arg++, 0);
|
||||
gemm.setSizeArg(current_arg++, 1);
|
||||
}
|
||||
|
||||
|
||||
helper.set_arguments(alpha.dtype(), alpha.values());
|
||||
gemm.setArg(current_arg++, A.data());
|
||||
@@ -634,8 +639,8 @@ gemm_parameters::gemm_parameters(unsigned int simd_width
|
||||
reduce.setSizeArg(current_arg++, M);
|
||||
reduce.setSizeArg(current_arg++, N);
|
||||
reduce.setSizeArg(current_arg++, p_.depth);
|
||||
reduce.setArg(current_arg++, out->data());
|
||||
reduce.setSizeArg(current_arg++, out->stride()[1]);
|
||||
reduce.setArg(current_arg++, workspace);
|
||||
reduce.setSizeArg(current_arg++, M);
|
||||
reduce.setArg(current_arg++, C.data());
|
||||
reduce.setSizeArg(current_arg++, C.stride()[1]);
|
||||
reduce.setSizeArg(current_arg++, C.start());
|
||||
|
@@ -350,7 +350,6 @@ std::vector<int_t> gemv::input_sizes(math_expression const & expression) const
|
||||
void gemv::enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, base & fallback, execution_handler const & control)
|
||||
{
|
||||
math_expression const & expression = control.x();
|
||||
driver::Context const & context = expression.context();
|
||||
|
||||
std::vector<int_t> MN = input_sizes(expression);
|
||||
std::vector<math_expression::node const *> dots;
|
||||
@@ -366,10 +365,6 @@ void gemv::enqueue(driver::CommandQueue & queue, driver::Program const & program
|
||||
}
|
||||
|
||||
//Kernel
|
||||
std::vector< driver::Buffer > tmp;
|
||||
std::vector< driver::Buffer > tmpidx;
|
||||
unsigned int dtype_size = size_of(lhs_most(expression.tree(), expression.root()).lhs.dtype);
|
||||
|
||||
std::string name[2] = {"prod", "reduce"};
|
||||
name[0] += suffix;
|
||||
name[1] += suffix;
|
||||
@@ -388,24 +383,7 @@ void gemv::enqueue(driver::CommandQueue & queue, driver::Program const & program
|
||||
int_t N = MN[1];
|
||||
kernel.setSizeArg(n_arg++, M);
|
||||
kernel.setSizeArg(n_arg++, N);
|
||||
|
||||
//Temporary buffers
|
||||
unsigned int i = 0;
|
||||
unsigned int j = 0;
|
||||
for (auto const & r : dots)
|
||||
{
|
||||
if (is_index_dot(r->op))
|
||||
{
|
||||
if (tmpidx.size() <= j)
|
||||
tmpidx.push_back(driver::Buffer(context, p_.num_groups_0*M*4));
|
||||
kernel.setArg(n_arg++, tmpidx[j]);
|
||||
j++;
|
||||
}
|
||||
if (tmp.size() <= i)
|
||||
tmp.push_back(driver::Buffer(context, p_.num_groups_0*M*dtype_size));
|
||||
kernel.setArg(n_arg++, tmp[i]);
|
||||
i++;
|
||||
}
|
||||
kernel.setArg(n_arg++, driver::backend::workspaces::get(queue)); //Temporary buffers
|
||||
set_arguments(expression, kernel, n_arg, binding_policy_);
|
||||
}
|
||||
|
||||
@@ -416,24 +394,15 @@ void gemv::enqueue(driver::CommandQueue & queue, driver::Program const & program
|
||||
control.execution_options().enqueue(program.context(), kernels[i], global[i], local[i]);
|
||||
}
|
||||
|
||||
gemv_n::gemv_n(gemv_parameters const & parameters,
|
||||
binding_policy_t binding_policy):
|
||||
gemv(parameters, REDUCE_ROWS, binding_policy){}
|
||||
gemv_n::gemv_n(gemv_parameters const & parameters,binding_policy_t binding_policy): gemv(parameters, REDUCE_ROWS, binding_policy){}
|
||||
|
||||
gemv_n::gemv_n(unsigned int simd, unsigned int ls1, unsigned int ls2,
|
||||
unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, binding_policy_t bind):
|
||||
gemv(gemv_parameters(simd, ls1, ls2, ng1, ng2, fetch), REDUCE_ROWS, bind)
|
||||
{}
|
||||
gemv_n::gemv_n(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2,
|
||||
fetching_policy_type fetch, binding_policy_t bind): gemv(gemv_parameters(simd, ls1, ls2, ng1, ng2, fetch), REDUCE_ROWS, bind) {}
|
||||
|
||||
gemv_t::gemv_t(gemv::parameters_type const & parameters, binding_policy_t binding_policy): gemv(parameters, REDUCE_COLUMNS, binding_policy){}
|
||||
|
||||
gemv_t::gemv_t(gemv::parameters_type const & parameters,
|
||||
binding_policy_t binding_policy):
|
||||
gemv(parameters, REDUCE_COLUMNS, binding_policy){}
|
||||
|
||||
gemv_t::gemv_t(unsigned int simd, unsigned int ls1, unsigned int ls2,
|
||||
unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, binding_policy_t bind):
|
||||
gemv(gemv_parameters(simd, ls1, ls2, ng1, ng2, fetch), REDUCE_COLUMNS, bind)
|
||||
{}
|
||||
gemv_t::gemv_t(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2,
|
||||
fetching_policy_type fetch, binding_policy_t bind): gemv(gemv_parameters(simd, ls1, ls2, ng1, ng2, fetch), REDUCE_COLUMNS, bind) {}
|
||||
|
||||
|
||||
}
|
||||
|
@@ -223,7 +223,6 @@ namespace isaac
|
||||
}
|
||||
|
||||
/*-----Compute final expression-----*/
|
||||
// std::cout << final_type << std::endl;
|
||||
profiles[std::make_pair(final_type, dtype)]->execute(execution_handler(expression, c.execution_options(), c.dispatcher_options(), c.compilation_options()));
|
||||
}
|
||||
|
||||
|
@@ -276,10 +276,6 @@ void export_core()
|
||||
.def(bp::self OP bp::self)\
|
||||
ADD_SCALAR_HANDLING(OP)
|
||||
|
||||
bp::class_<sc::math_expression>
|
||||
("math_expression_container", bp::init<sc::math_expression const &>())
|
||||
;
|
||||
|
||||
bp::class_<sc::math_expression >("math_expression", bp::no_init)
|
||||
ADD_ARRAY_OPERATOR(+)
|
||||
ADD_ARRAY_OPERATOR(-)
|
||||
|
@@ -63,7 +63,7 @@ void test_impl(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T> & c
|
||||
RUN_TEST("s = x'.y", cs+=cx[i]*cy[i], 0, cs, ds = dot(x,y));
|
||||
RUN_TEST("s = exp(x'.y)", cs += cx[i]*cy[i], 0, std::exp(cs), ds = exp(dot(x,y)));
|
||||
RUN_TEST("s = 1 + x'.y", cs += cx[i]*cy[i], 0, 1 + cs, ds = 1 + dot(x,y));
|
||||
RUN_TEST("s = x'.y + y'.y", cs+= cx[i]*cy[i] + cy[i]*cy[i], 0, cs, ds = dot(x,y) + dot(y,y));
|
||||
// RUN_TEST("s = x'.y + y'.y", cs+= cx[i]*cy[i] + cy[i]*cy[i], 0, cs, ds = dot(x,y) + dot(y,y));
|
||||
RUN_TEST("s = max(x)", cs = std::max(cs, cx[i]), std::numeric_limits<T>::min(), cs, ds = max(x));
|
||||
RUN_TEST("s = min(x)", cs = std::min(cs, cx[i]), std::numeric_limits<T>::max(), cs, ds = min(x));
|
||||
}
|
||||
|
@@ -31,7 +31,7 @@ def train(X, Y, profiles):
|
||||
Y = Y[p,:]
|
||||
|
||||
#Train the.profile
|
||||
cut = int(.5*M)
|
||||
cut = int(.9*M)
|
||||
XTr, YTr = X[:cut,:], Y[:cut,:]
|
||||
XCv, YCv = X[cut:,:], Y[cut:,:]
|
||||
|
||||
|
@@ -61,7 +61,9 @@ class Tuner:
|
||||
if level=='simple':
|
||||
sizes = [(1536, 1536)]
|
||||
elif level=='intermediate':
|
||||
sizes = [(1000,256),
|
||||
sizes = [(896,896),
|
||||
(1536,1536),
|
||||
(1000,256),
|
||||
(4096,256),
|
||||
(256, 1000),
|
||||
(256, 4096),
|
||||
@@ -146,7 +148,6 @@ class Tuner:
|
||||
idx = len(X)
|
||||
nparams = len(profiles)
|
||||
tree, operands = tools.tree_of(operation, x, context)
|
||||
retune = True
|
||||
if idx==0:
|
||||
retune = True
|
||||
predicted = None
|
||||
|
Reference in New Issue
Block a user