Feature: Merged kernel-fusion branch

* Fuses multiple AXPY kernels
* Adds the possibility of thread-wise for-loops in AXPY-like kernels
This commit is contained in:
Philippe Tillet
2015-09-30 15:31:41 -04:00
parent 149441b9e2
commit feeb1e9862
64 changed files with 10047 additions and 1119 deletions

View File

@@ -73,7 +73,7 @@ def main():
libraries += ['gnustl_shared']
#Source files
src = 'src/lib/symbolic/preset.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/io.cpp src/lib/symbolic/expression.cpp src/lib/array.cpp src/lib/value_scalar.cpp src/lib/driver/backend.cpp src/lib/driver/device.cpp src/lib/driver/kernel.cpp src/lib/driver/buffer.cpp src/lib/driver/platform.cpp src/lib/driver/check.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/dispatch.cpp src/lib/driver/program_cache.cpp src/lib/driver/context.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/handle.cpp src/lib/exception/unknown_datatype.cpp src/lib/exception/operation_not_supported.cpp src/lib/profiles/presets.cpp src/lib/profiles/profiles.cpp src/lib/profiles/predictors/random_forest.cpp src/lib/kernels/templates/gemv.cpp src/lib/kernels/templates/axpy.cpp src/lib/kernels/templates/gemm.cpp src/lib/kernels/templates/ger.cpp src/lib/kernels/templates/dot.cpp src/lib/kernels/templates/base.cpp src/lib/kernels/mapped_object.cpp src/lib/kernels/stream.cpp src/lib/kernels/parse.cpp src/lib/kernels/keywords.cpp src/lib/kernels/binder.cpp src/lib/wrap/clBLAS.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
src = 'src/lib/value_scalar.cpp src/lib/wrap/clBLAS.cpp src/lib/profiles/predictors/random_forest.cpp src/lib/profiles/profiles.cpp src/lib/profiles/presets.cpp src/lib/exception/unknown_datatype.cpp src/lib/exception/operation_not_supported.cpp src/lib/driver/handle.cpp src/lib/driver/dispatch.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/program_cache.cpp src/lib/driver/command_queue.cpp src/lib/driver/buffer.cpp src/lib/driver/backend.cpp src/lib/driver/platform.cpp src/lib/driver/program.cpp src/lib/driver/kernel.cpp src/lib/driver/device.cpp src/lib/driver/check.cpp src/lib/driver/context.cpp src/lib/symbolic/preset.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/expression.cpp src/lib/symbolic/io.cpp src/lib/array.cpp src/lib/kernels/parse.cpp src/lib/kernels/templates/ger.cpp src/lib/kernels/templates/axpy.cpp src/lib/kernels/templates/gemm.cpp src/lib/kernels/templates/base.cpp src/lib/kernels/templates/gemv.cpp src/lib/kernels/templates/dot.cpp src/lib/kernels/binder.cpp src/lib/kernels/keywords.cpp src/lib/kernels/mapped_object.cpp src/lib/kernels/stream.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
boostsrc = 'external/boost/libs/'
for s in ['numpy','python','smart_ptr','system','thread']:
src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]

View File

@@ -276,11 +276,11 @@ void export_core()
.def(bp::self OP bp::self)\
ADD_SCALAR_HANDLING(OP)
bp::class_<sc::expressions_tuple>
("array_expression_container", bp::init<sc::array_expression const &>())
bp::class_<sc::math_expression>
("math_expression_container", bp::init<sc::math_expression const &>())
;
bp::class_<sc::array_expression >("array_expression", bp::no_init)
bp::class_<sc::math_expression >("math_expression", bp::no_init)
ADD_ARRAY_OPERATOR(+)
ADD_ARRAY_OPERATOR(-)
ADD_ARRAY_OPERATOR(*)
@@ -291,7 +291,7 @@ void export_core()
ADD_ARRAY_OPERATOR(<=)
ADD_ARRAY_OPERATOR(==)
ADD_ARRAY_OPERATOR(!=)
.add_property("context", bp::make_function(&sc::array_expression::context, bp::return_internal_reference<>()))
.add_property("context", bp::make_function(&sc::math_expression::context, bp::return_internal_reference<>()))
.def(bp::self_ns::abs(bp::self))
// .def(bp::self_ns::pow(bp::self))
;
@@ -299,15 +299,15 @@ void export_core()
#define ADD_ARRAY_OPERATOR(OP) \
.def(bp::self OP bp::self)\
.def(bp::self OP bp::other<sc::array_expression>())\
.def(bp::other<sc::array_expression>() OP bp::self) \
.def(bp::self OP bp::other<sc::math_expression>())\
.def(bp::other<sc::math_expression>() OP bp::self) \
ADD_SCALAR_HANDLING(OP)
bp::class_<sc::array,
std::shared_ptr<sc::array> >
( "array", bp::no_init)
.def("__init__", bp::make_constructor(detail::create_array, bp::default_call_policies(), (bp::arg("obj"), bp::arg("dtype") = bp::scope().attr("float32"), bp::arg("context")= bp::object())))
.def(bp::init<sc::array_expression>())
.def(bp::init<sc::math_expression>())
.add_property("dtype", &sc::array::dtype)
.add_property("context", bp::make_function(&sc::array::context, bp::return_internal_reference<>()))
.add_property("T", &sc::array::T)
@@ -336,15 +336,15 @@ void export_core()
bp::def("empty", &detail::create_empty_array, (bp::arg("shape"), bp::arg("dtype") = bp::scope().attr("float32"), bp::arg("context")=bp::object()));
//Assign
bp::def("assign", static_cast<sc::array_expression (*)(sc::array const &, sc::array const &)>(&sc::assign));\
bp::def("assign", static_cast<sc::array_expression (*)(sc::array const &, sc::array_expression const &)>(&sc::assign));\
bp::def("assign", static_cast<sc::math_expression (*)(sc::array const &, sc::array const &)>(&sc::assign));\
bp::def("assign", static_cast<sc::math_expression (*)(sc::array const &, sc::math_expression const &)>(&sc::assign));\
//Binary
#define MAP_FUNCTION(name) \
bp::def(#name, static_cast<sc::array_expression (*)(sc::array const &, sc::array const &)>(&sc::name));\
bp::def(#name, static_cast<sc::array_expression (*)(sc::array_expression const &, sc::array const &)>(&sc::name));\
bp::def(#name, static_cast<sc::array_expression (*)(sc::array const &, sc::array_expression const &)>(&sc::name));\
bp::def(#name, static_cast<sc::array_expression (*)(sc::array_expression const &, sc::array_expression const &)>(&sc::name));
bp::def(#name, static_cast<sc::math_expression (*)(sc::array const &, sc::array const &)>(&sc::name));\
bp::def(#name, static_cast<sc::math_expression (*)(sc::math_expression const &, sc::array const &)>(&sc::name));\
bp::def(#name, static_cast<sc::math_expression (*)(sc::array const &, sc::math_expression const &)>(&sc::name));\
bp::def(#name, static_cast<sc::math_expression (*)(sc::math_expression const &, sc::math_expression const &)>(&sc::name));
MAP_FUNCTION(maximum)
MAP_FUNCTION(minimum)
@@ -354,8 +354,8 @@ void export_core()
//Unary
#define MAP_FUNCTION(name) \
bp::def(#name, static_cast<sc::array_expression (*)(sc::array const &)>(&sc::name));\
bp::def(#name, static_cast<sc::array_expression (*)(sc::array_expression const &)>(&sc::name));
bp::def(#name, static_cast<sc::math_expression (*)(sc::array const &)>(&sc::name));\
bp::def(#name, static_cast<sc::math_expression (*)(sc::math_expression const &)>(&sc::name));
bp::def("zeros", &detail::create_zeros_array, (bp::arg("shape"), bp::arg("dtype") = bp::scope().attr("float32"), bp::arg("context")=bp::object()));
@@ -380,8 +380,8 @@ void export_core()
/*--- Reduction operators----*/
//---------------------------------------
#define MAP_FUNCTION(name) \
bp::def(#name, static_cast<sc::array_expression (*)(sc::array const &, sc::int_t)>(&sc::name));\
bp::def(#name, static_cast<sc::array_expression (*)(sc::array_expression const &, sc::int_t)>(&sc::name));
bp::def(#name, static_cast<sc::math_expression (*)(sc::array const &, sc::int_t)>(&sc::name));\
bp::def(#name, static_cast<sc::math_expression (*)(sc::math_expression const &, sc::int_t)>(&sc::name));
MAP_FUNCTION(sum)
MAP_FUNCTION(max)

View File

@@ -62,7 +62,7 @@ namespace detail
// Creates a new sc::driver::Context from the device `dev` and returns it
// wrapped in a std::shared_ptr that owns the heap-allocated object.
std::shared_ptr<sc::driver::Context> make_context(sc::driver::Device const & dev)
{ return std::shared_ptr<sc::driver::Context>(new sc::driver::Context(dev)); }
bp::object enqueue(sc::array_expression const & expression, unsigned int queue_id, bp::list dependencies, bool tune, int label, std::string const & program_name, bool force_recompile)
bp::object enqueue(sc::math_expression const & expression, unsigned int queue_id, bp::list dependencies, bool tune, int label, std::string const & program_name, bool force_recompile)
{
std::list<sc::driver::Event> events;
std::vector<sc::driver::Event> cdependencies = tools::to_vector<sc::driver::Event>(dependencies);
@@ -70,15 +70,15 @@ namespace detail
sc::execution_options_type execution_options(queue_id, &events, &cdependencies);
sc::dispatcher_options_type dispatcher_options(tune, label);
sc::compilation_options_type compilation_options(program_name, force_recompile);
sc::array_expression::container_type::value_type root = expression.tree()[expression.root()];
sc::math_expression::container_type::value_type root = expression.tree()[expression.root()];
if(sc::detail::is_assignment(root.op))
{
sc::execute(sc::control(expression, execution_options, dispatcher_options, compilation_options), isaac::profiles::get(execution_options.queue(expression.context())));
sc::execute(sc::execution_handler(expression, execution_options, dispatcher_options, compilation_options), isaac::profiles::get(execution_options.queue(expression.context())));
return bp::make_tuple(bp::ptr(root.lhs.array), tools::to_list(events.begin(), events.end()));
}
else
{
std::shared_ptr<sc::array> parray(new sc::array(sc::control(expression, execution_options, dispatcher_options, compilation_options)));
std::shared_ptr<sc::array> parray(new sc::array(sc::execution_handler(expression, execution_options, dispatcher_options, compilation_options)));
return bp::make_tuple(parray, tools::to_list(events.begin(), events.end()));
}
}

View File

@@ -13,7 +13,7 @@ namespace tpt = isaac::templates;
namespace detail
{
bp::list input_sizes(tpt::base & temp, sc::expressions_tuple const & tree)
bp::list input_sizes(tpt::base & temp, sc::math_expression const & tree)
{
std::vector<isaac::int_t> tmp = temp.input_sizes(tree);
return tools::to_list(tmp.begin(), tmp.end());