Bugfix in autotuner

2015-01-21 20:08:52 -05:00
parent d285bd81e0
commit 9a76be3edc
7 changed files with 191 additions and 117 deletions
--- a/include/atidlas/array.h
+++ b/include/atidlas/array.h
@@ -19,14 +19,14 @@ class array: public obj_base
 public:
  //1D Constructors
  array(int_t size1, numeric_type dtype, cl::Context context = cl::default_context());
-  template<typename T>
-  array(std::vector<T> const & data, cl::Context context = cl::default_context());
+  template<typename DT>
+  array(std::vector<DT> const & data, cl::Context context = cl::default_context());
  array(array & v, slice const & s1);

  //2D Constructors
  array(int_t size1, int_t size2, numeric_type dtype, cl::Context context = cl::default_context());
-  template<typename T>
-  array(int_t size1, int_t size2, std::vector<T> const & data, cl::Context context = cl::default_context());
+  template<typename DT>
+  array(int_t size1, int_t size2, std::vector<DT> const & data, cl::Context context = cl::default_context());
  array(array & M, slice const & s1, slice const & s2);

  //General constructor
@@ -72,6 +72,8 @@ public:
  scalar operator[](int_t);
  array operator[](slice const &);
  array operator()(slice const &, slice const &);
+
+  array_expression T() const;
 protected:
  numeric_type dtype_;

@@ -113,9 +115,7 @@ public:
 };


-atidlas::array_expression eye(std::size_t, std::size_t, atidlas::numeric_type, cl::Context ctx = cl::default_context());
-array_expression zeros(std::size_t M, std::size_t N, numeric_type dtype, cl::Context ctx = cl::default_context());
-array reshape(array const &, int_t, int_t);
+

 //copy

@@ -209,6 +209,10 @@ ATIDLAS_DECLARE_REDUCTION(max)
 ATIDLAS_DECLARE_REDUCTION(min)
 ATIDLAS_DECLARE_REDUCTION(argmin)

+atidlas::array_expression eye(std::size_t, std::size_t, atidlas::numeric_type, cl::Context ctx = cl::default_context());
+array_expression zeros(std::size_t M, std::size_t N, numeric_type dtype, cl::Context ctx = cl::default_context());
+array reshape(array const &, int_t, int_t);
+
 //
 std::ostream& operator<<(std::ostream &, array const &);
 std::ostream& operator<<(std::ostream & os, scalar const & s);
--- a/include/atidlas/symbolic/expression.h
+++ b/include/atidlas/symbolic/expression.h
@@ -104,6 +104,7 @@ enum operation_node_type
  OPERATOR_MATRIX_ROW_TYPE,
  OPERATOR_MATRIX_COLUMN_TYPE,
  OPERATOR_REPEAT_TYPE,
+  OPERATOR_SHIFT_TYPE,
  OPERATOR_VDIAG_TYPE,

  OPERATOR_MATRIX_PRODUCT_NN_TYPE,
--- a/lib/array.cpp
+++ b/lib/array.cpp
@@ -19,9 +19,9 @@ array::array(int_t size1, numeric_type dtype, cl::Context context) :
  context_(context), data_(context_, CL_MEM_READ_WRITE, size_of(dtype)*dsize())
 { }

-template<class T>
-array::array(std::vector<T> const & x, cl::Context context):
-  dtype_(to_numeric_type<T>::value), shape_(x.size(), 1), start_(0, 0), stride_(1, 1), ld_(shape_._1),
+template<class DT>
+array::array(std::vector<DT> const & x, cl::Context context):
+  dtype_(to_numeric_type<DT>::value), shape_(x.size(), 1), start_(0, 0), stride_(1, 1), ld_(shape_._1),
  context_(context), data_(context, CL_MEM_READ_WRITE, size_of(dtype_)*dsize())
 { *this = x; }

@@ -53,9 +53,9 @@ array::array(array & M, slice const & s1, slice const & s2) :  dtype_(M.dtype_),
                                                          context_(M.data_.getInfo<CL_MEM_CONTEXT>()), data_(M.data_)
 { }

-template<typename T>
-array::array(int_t size1, int_t size2, std::vector<T> const & data, cl::Context context)
-  : dtype_(to_numeric_type<T>::value),
+template<typename DT>
+array::array(int_t size1, int_t size2, std::vector<DT> const & data, cl::Context context)
+  : dtype_(to_numeric_type<DT>::value),
    shape_(size1, size2), start_(0, 0), stride_(1, 1), ld_(size1),
    context_(context), data_(context_, CL_MEM_READ_WRITE, size_of(dtype_)*dsize())
 {
@@ -146,8 +146,8 @@ array & array::operator=(array_expression const & rhs)
  return *this;
 }

-template<class T>
-array & array::operator=(std::vector<T> const & rhs)
+template<class DT>
+array & array::operator=(std::vector<DT> const & rhs)
 {
  assert(nshape()==1);
  atidlas::copy(rhs, *this);
@@ -208,6 +208,9 @@ array & array::operator/=(array const & rhs)
 array & array::operator/=(array_expression const & rhs)
 { return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_DIV_TYPE), shape_); }

+array_expression array::T() const
+{ return atidlas::trans(*this) ;}
+
 /*--- Indexing operators -----*/
 //---------------------------------------
 scalar array::operator [](int_t idx)
@@ -481,17 +484,17 @@ atidlas::array_expression zeros(std::size_t M, std::size_t N, atidlas::numeric_t
  return array_expression(value_scalar(0), lhs_rhs_element(), op_element(OPERATOR_UNARY_TYPE_FAMILY, OPERATOR_ADD_TYPE), ctx, dtype, size4(M, N));
 }

-inline size4 trans(size4 const & shape)
+inline size4 flip(size4 const & shape)
 { return size4(shape._2, shape._1);}

 inline size4 prod(size4 const & shape1, size4 const & shape2)
 { return size4(shape1._1*shape2._1, shape1._2*shape2._2);}

 array_expression trans(array  const & x) \
-{ return array_expression(x, lhs_rhs_element(), op_element(OPERATOR_UNARY_TYPE_FAMILY, OPERATOR_TRANS_TYPE), x.context(), x.dtype(), trans(x.shape())); }\
+{ return array_expression(x, lhs_rhs_element(), op_element(OPERATOR_UNARY_TYPE_FAMILY, OPERATOR_TRANS_TYPE), x.context(), x.dtype(), flip(x.shape())); }\
 \
 array_expression trans(array_expression const & x) \
-{ return array_expression(x, lhs_rhs_element(), op_element(OPERATOR_UNARY_TYPE_FAMILY, OPERATOR_TRANS_TYPE), trans(x.shape())); }
+{ return array_expression(x, lhs_rhs_element(), op_element(OPERATOR_UNARY_TYPE_FAMILY, OPERATOR_TRANS_TYPE), flip(x.shape())); }

 array_expression repmat(array const & A, int_t const & rep1, int_t const & rep2)
 {
--- a/python/autotune/pysrc/autotune.py
+++ b/python/autotune/pysrc/autotune.py
@@ -3,7 +3,6 @@ from __future__ import division
 import argparse, itertools, os, sys, json
 import misc_tools, optimize, dataset
 import pyatidlas as atd
-import pyopencl as cl
 import numpy as np

 from numpy import random
@@ -34,7 +33,8 @@ TYPES = { 'vaxpy': {'template':atd.vaxpy,

 def do_tuning(args):
    device = args.device
-
+    context = atd.context(device)
+    context.queues.append(atd.command_queue(context, device))
    if os.path.isfile(args.json_file):
        json_out = json.load(open(args.json_file, 'r'))
    else:
@@ -98,7 +98,7 @@ def do_tuning(args):
                  D = json_out[full_operation][dtypestr]

                  if args.method == 'simple':
-                      print default_tuning_sizes[operation]
+                      print 'Size : ', ','.join(map(str, default_tuning_sizes[operation]))
                      profiles = [execution_handler(map(int,default_tuning_sizes[operation]))]
                  else:
                      def compute_perf(x, t):
@@ -125,48 +125,48 @@ def do_tuning(args):
              #Vector AXPY
              if operation=='vaxpy':
                  def execution_handler(sizes, fname=os.devnull, parameters=None):
-                      x = atd.empty(sizes[0], datatype)
-                      y = atd.empty(sizes[0], datatype)
+                      x = atd.empty(sizes[0], datatype, context=context)
+                      y = atd.empty(sizes[0], datatype, context=context)
                      return execute(x + y, sizes, Template, parameters, fname)
                  tune(execution_handler, 1e3, 2e7, 1, (),'log', 'log')
              #dot
              if operation=='dot':
                  def execution_handler(sizes, fname=os.devnull, parameters=None):
-                      x = atd.empty(sizes[0], datatype)
-                      y = atd.empty(sizes[0], datatype)
+                      x = atd.empty(sizes[0], datatype, context=context)
+                      y = atd.empty(sizes[0], datatype, context=context)
                      s = atd.scalar(datatype)
                      return execute(atd.dot(x, y), sizes, Template, parameters, fname)
                  tune(execution_handler, 1e3, 2e7, 1, (),'log', 'log')
              #Matrix AXPY
              if operation=='maxpy':
                  def execution_handler(sizes, fname=os.devnull, parameters=None):
-                      A = atd.empty(sizes, datatype)
-                      C = atd.empty(sizes, datatype)
+                      A = atd.empty(sizes, datatype, context=context)
+                      C = atd.empty(sizes, datatype, context=context)
                      return execute(A + C, sizes, Template, parameters, fname)
                  tune(execution_handler, 100, 5000, 2, (),'log', 'log')
              #Row-wise dot
              if operation=='gemv':
                  for A_trans in  args.gemv_layouts:
-                      def execution_handler(sizes, fname=os.devnull, parameters=None):
                      Template = Template[A_trans]
-                          A = atd.empty(sizes if A_trans=='N' else sizes[::-1], datatype)
-                          x = atd.empty(sizes[1], datatype)
+                      def execution_handler(sizes, fname=os.devnull, parameters=None):
+                          A = atd.empty(sizes if A_trans=='N' else sizes[::-1], datatype, context=context)
+                          x = atd.empty(sizes[1], datatype, context=context)
                          LHS = A if A_trans=='N' else A.T
-                          return execute(device, atd.dot(LHS, x), sizes, Template, parameters, fname)
+                          return execute(atd.dot(LHS, x), sizes, Template, parameters, fname)
                      tune(execution_handler, 100, 5000, 2, (A_trans,),'log', 'log')
              #Matrix Product
              if operation=='gemm':
                  for L in args.gemm_layouts:
                      A_trans = L[0]
                      B_trans = L[1]
+                      Template = Template[(A_trans, B_trans)]
                      def execution_handler(sizes, fname=os.devnull, parameters=None):
-                          Template = Template[A_trans, B_trans]
-                          A = atd.empty((sizes[0], sizes[2]) if A_trans=='N' else (sizes[2], sizes[0]), datatype)
-                          B = atd.empty((sizes[2], sizes[1]) if B_trans=='N' else (sizes[1], sizes[2]), datatype)
+                          A = atd.empty((sizes[0], sizes[2]) if A_trans=='N' else (sizes[2], sizes[0]), datatype, context=context)
+                          B = atd.empty((sizes[2], sizes[1]) if B_trans=='N' else (sizes[1], sizes[2]), datatype, context=context)
                          LHS = A if A_trans=='N' else A.T
                          RHS = B if B_trans=='N' else B.T
-                          return execute(device, atd.dot(LHS, RHS),(A_trans,B_trans), sizes, fname, parameters)
-                      tune(execution_handler, 100, 2000, 3,(A_trans,B_trans), 'linear')
+                          return execute(atd.dot(LHS, RHS), sizes, Template, parameters, fname)
+                      tune(execution_handler, 100, 2000, 3,(A_trans,B_trans), 'linear', 'linear')

              json.dump(json_out, open(args.json_file,'w'))

@@ -177,25 +177,6 @@ class ArgumentsHandler:

    def __init__(self):

-        #No action argument -> interactive tuning
-        if len(sys.argv)==1:
-            def add_input(help, default):
-                return raw_input(help + "[" + default + "] : ") or default
-
-            self.device = add_input('Device to tune for','0')
-            self.operations = add_input('Operations to tune for','vaxpy,maxpy,dot,gemv,gemm-float32')
-            self.gemm_layouts = add_input('GEMV Layouts', 'NN,NT,TN,TT')
-            self.gemv_layouts =  add_input('GEMV Layouts', 'N,T')
-            self.json_file = add_input('JSON File', misc_tools.sanitize_string(devices[int(self.device)].name) + '.json')
-            self.method = add_input('Tuning type', 'simple')
-            if self.method == 'simple':
-                self.blas1_size = add_input('BLAS1 size', '10e6')
-                self.blas2_size = add_input('BLAS2 sizes (M,N)', '2560,2560').split(',')
-                self.blas3_size = add_input('BLAS3 sizes (M,N,K)', '1024,1024,1024').split(',')
-            else:
-              self.build_model = True
-              self.sample_size = 30
-        else:
        #Command line arguments
        parser = argparse.ArgumentParser()
        subparsers = parser.add_subparsers(dest='action')
@@ -236,12 +217,13 @@ class ArgumentsHandler:

 if __name__ == "__main__":

-    devices = [d for platform in cl.get_platforms() for d in platform.get_devices()]
+    platforms = atd.get_platforms()
+    devices = [d for platform in platforms for d in platform.get_devices()]
    print("----------------")
    print("Devices available:")
    print("----------------")
    for (i, d) in enumerate(devices):
-        print 'Device', i, '|',  cl.device_type.to_string(d.type), '|', d.name, 'on', d.platform.name
+        print 'Device', i, '|',  atd.device_type_to_string(d.type), '|', d.name, 'on', d.platform.name
    print("----------------")

    args = ArgumentsHandler()
--- a/python/autotune/pysrc/misc_tools.py
+++ b/python/autotune/pysrc/misc_tools.py
@@ -1,6 +1,5 @@
 from __future__ import division

-import pyopencl
 import time
 import os
 import sys
@@ -186,10 +185,13 @@ class OccupancyRecord:


    def __init__(self, dev, threads, shared_mem=0, registers=0):
-        if 'advanced micro devices' in dev.vendor.lower():
+        vendor = dev.vendor.lower()
+        if any(X in vendor for X in ['advanced micro devices', 'amd']):
            self.init_amd(dev, threads, shared_mem, registers)
-        elif 'nvidia' in dev.vendor.lower():
+        elif 'nvidia' in vendor:
            self.init_nvidia(dev, threads, shared_mem, registers)
+        elif 'intel' in vendor:
+            self.occupancy = 100



--- a/python/autotune/pysrc/misc_tools.pyc
+++ b/python/autotune/pysrc/misc_tools.pyc
--- a/python/pyatidlas/src/_atidlas.cpp
+++ b/python/pyatidlas/src/_atidlas.cpp
@@ -85,13 +85,13 @@ bp::tuple get_shape(atd::array const & x)
  return bp::make_tuple(x.shape()._1, x.shape()._2);
 }

-void set_shape(atd::array & x, bp::tuple const & t)
-{
-  unsigned int len = bp::len(t);
-  atd::int_t size1 = bp::extract<atd::int_t>(t[0]);
-  atd::int_t size2 = len<2?1:bp::extract<atd::int_t>(t[1]);
-  x.reshape(size1, size2);
-}
+//void set_shape(atd::array & x, bp::tuple const & t)
+//{
+//  unsigned int len = bp::len(t);
+//  atd::int_t size1 = bp::extract<atd::int_t>(t[0]);
+//  atd::int_t size2 = len<2?1:bp::extract<atd::int_t>(t[1]);
+//  x.reshape(size1, size2);
+//}

 boost::python::dict create_queues(atd::cl::queues_t queues)
 {
@@ -182,6 +182,15 @@ void export_symbolic()

 namespace detail
 {
+  template<class IT>
+  bp::list to_list(IT const & begin, IT const & end)
+  {
+    bp::list res;
+    for (IT it = begin; it != end; ++it)
+      res.append(*it);
+    return res;
+  }
+
  bp::list nv_compute_capability(atd::cl::Device const & device)
  {
    bp::list res;
@@ -190,16 +199,23 @@ namespace detail
    return res;
  }

-  std::string vendor(atd::cl::Device const & device){
-    return device.getInfo<CL_DEVICE_VENDOR>();
+  bp::list get_platforms()
+  {
+    std::vector<atd::cl::Platform> platforms;
+    atd::cl::Platform::get(&platforms);
+    return to_list(platforms.begin(), platforms.end());
+  }
+
+  bp::list get_devices(atd::cl::Platform const & platform)
+  {
+    std::vector<atd::cl::Device> devices;
+    platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
+    return to_list(devices.begin(), devices.end());
  }

  std::vector<atd::cl::CommandQueue> & get_queue(atd::cl::Context const & ctx)
  { return atd::cl::queues[ctx]; }

-  atd::cl::Device get_device(atd::cl::CommandQueue & queue)
-  { return queue.getInfo<CL_QUEUE_DEVICE>(); }
-
  atd::numeric_type extract_dtype(bp::object const & odtype)
  {
      std::string name = bp::extract<std::string>(odtype.attr("__class__").attr("__name__"))();
@@ -272,20 +288,50 @@ namespace detail
      }
  };

+  atd::cl::Platform get_platform(atd::cl::Device const & device)
+  {  return atd::cl::Platform(device.getInfo<CL_DEVICE_PLATFORM>());  }
+
+  template<cl_int INFO>
+  typename atd::cl::detail::param_traits<atd::cl::detail::cl_device_info, INFO>::param_type
+  wrap_device_info(atd::cl::Device const & x)
+  { return x.getInfo<INFO>(NULL); }
+
+  template<cl_int INFO>
+  typename atd::cl::detail::param_traits<atd::cl::detail::cl_context_info, INFO>::param_type
+  wrap_context_info(atd::cl::Context const & x)
+  { return x.getInfo<INFO>(NULL); }
+
+  template<cl_int INFO>
+  typename atd::cl::detail::param_traits<atd::cl::detail::cl_platform_info, INFO>::param_type
+  wrap_platform_info(atd::cl::Platform const & x)
+  { return x.getInfo<INFO>(NULL); }
+
+  template<cl_int INFO>
+  typename atd::cl::detail::param_traits<atd::cl::detail::cl_command_queue_info, INFO>::param_type
+  wrap_command_queue_info(atd::cl::CommandQueue const & x)
+  { return x.getInfo<INFO>(NULL); }
+
+
+  std::string to_string(cl_device_type type) // short display name for an OpenCL device-type value
+  {
+    if(type==CL_DEVICE_TYPE_ALL) return "ALL";
+    if(type==CL_DEVICE_TYPE_CPU) return "CPU";
+    if(type==CL_DEVICE_TYPE_GPU) return "GPU";
+    if(type==CL_DEVICE_TYPE_ACCELERATOR) return "ACCELERATOR";
+    return "UNKNOWN"; // was a bare `throw;`: with no active exception that calls std::terminate
+  }
 }

+
 void export_cl()
 {
  typedef std::vector<atd::cl::CommandQueue> queues_t;
-
  bp::class_<queues_t>("queues")
+      .def("__len__", &queues_t::size)
      .def("__getitem__", &bp::vector_indexing_suite<queues_t>::get_item, bp::return_internal_reference<>())
      .def("__setitem__", &bp::vector_indexing_suite<queues_t>::set_item, bp::with_custodian_and_ward<1,2>())
-      ;
+      .def("append", &bp::vector_indexing_suite<queues_t>::append)

-  bp::class_<atd::cl::Device>("device", bp::no_init)
-      .add_property("nv_compute_capability", &detail::nv_compute_capability)
-      .add_property("vendor", &detail::vendor)
      ;

  bp::class_<atd::model_map_t>("models")
@@ -293,18 +339,50 @@ void export_cl()
      .def("__setitem__", &detail::model_map_indexing::set_item, bp::with_custodian_and_ward<1,2>())
      ;

-  bp::class_<atd::cl::Context>("context", bp::no_init)
+  bp::enum_<cl_device_type>("device_type")
+      .value("CL_DEVICE_TYPE_ALL", CL_DEVICE_TYPE_ALL)
+      .value("CL_DEVICE_TYPE_CPU", CL_DEVICE_TYPE_CPU)
+      .value("CL_DEVICE_TYPE_GPU", CL_DEVICE_TYPE_GPU)
+      .value("CL_DEVICE_TYPE_ACCELERATOR", CL_DEVICE_TYPE_ACCELERATOR)
+      ;
+
+  bp::def("device_type_to_string", &detail::to_string);
+
+
+  bp::class_<atd::cl::Platform>("platform", bp::no_init)
+    #define WRAP(PYNAME, NAME) .add_property(PYNAME, &detail::wrap_platform_info<NAME>)
+      WRAP("name", CL_PLATFORM_NAME)
+    #undef WRAP
+      .def("get_devices", &detail::get_devices)
+      ;
+
+  bp::class_<atd::cl::Device>("device", bp::no_init)
+    #define WRAP(PYNAME, NAME) .add_property(PYNAME, &detail::wrap_device_info<NAME>)
+      .add_property("nv_compute_capability", &detail::nv_compute_capability)
+      .add_property("platform", &detail::get_platform)
+      WRAP("double_fp_config", CL_DEVICE_DOUBLE_FP_CONFIG)
+      WRAP("name", CL_DEVICE_NAME)
+      WRAP("type", CL_DEVICE_TYPE)
+      WRAP("vendor", CL_DEVICE_VENDOR)
+    #undef WRAP
+      ;
+
+  bp::class_<atd::cl::Context>("context", bp::init<atd::cl::Device>())
+    #define WRAP(PYNAME, NAME) .add_property(PYNAME, &detail::wrap_context_info<NAME>)
+    #undef WRAP
      .add_property("queues", bp::make_function(&detail::get_queue, bp::return_internal_reference<>()))
      ;

-
-
-  bp::class_<atd::cl::CommandQueue>("command_queue", bp::no_init)
-      .add_property("device", &detail::get_device)
+  bp::class_<atd::cl::CommandQueue>("command_queue", bp::init<atd::cl::Context, atd::cl::Device>())
+    #define WRAP(PYNAME, NAME) .add_property(PYNAME, &detail::wrap_command_queue_info<NAME>)
+      WRAP("device", CL_QUEUE_DEVICE)
+    #undef WRAP
      .add_property("models", bp::make_function(&atd::get_model_map, bp::return_internal_reference<>()));
      ;

  bp::def("synchronize", &atd::cl::synchronize);
+  bp::def("get_platforms", &detail::get_platforms);
+
 }

 namespace detail
@@ -446,6 +524,7 @@ void export_array()
      .def(bp::init<atd::array_expression>())
      .add_property("dtype", &atd::array::dtype)
      .add_property("context", bp::make_function(&atd::array::context, bp::return_internal_reference<>()))
+      .add_property("T", &atd::array::T)
 //      .add_property("shape", &detail::get_shape, &detail::set_shape)
      ADD_ARRAY_OPERATOR(+)
      ADD_ARRAY_OPERATOR(-)
@@ -477,8 +556,8 @@ void export_array()
      bp::def(#name, static_cast<atd::array_expression (*)(atd::array const &, atd::array_expression const &)>(&atd::name));\
      bp::def(#name, static_cast<atd::array_expression (*)(atd::array_expression const &, atd::array_expression const &)>(&atd::name));

-  MAP_FUNCTION(max)
-  MAP_FUNCTION(min)
+  MAP_FUNCTION(maximum)
+  MAP_FUNCTION(minimum)
  MAP_FUNCTION(pow)
  MAP_FUNCTION(dot)
 #undef MAP_FUNCTION
@@ -551,21 +630,24 @@ void export_model()
    #undef __PROP
  }

-  #define WRAP_TEMPLATE(name, ...) bp::class_<atidlas::base_impl<atidlas::name, atidlas::name::parameters_type>, bp::bases<atidlas::base>, boost::noncopyable>(#name "_base_impl", bp::no_init);\
-                                   bp::class_<atidlas::name, bp::bases<atidlas::base_impl<atidlas::name, atidlas::name::parameters_type> > >(#name, bp::init<__VA_ARGS__>())\
+  #define WRAP_BASE(name) bp::class_<atidlas::base_impl<atidlas::name, atidlas::name::parameters_type>, bp::bases<atidlas::base>, boost::noncopyable>(#name "_base_impl", bp::no_init);
+  #define WRAP_TEMPLATE(name, basename, ...) bp::class_<atidlas::name, bp::bases<atidlas::base_impl<atidlas::basename, atidlas::basename::parameters_type> > >(#name, bp::init<__VA_ARGS__>())\
                                      .add_property("local_size_0", &atd::name::local_size_0)\
                                      .add_property("local_size_1", &atd::name::local_size_1);
+  #define WRAP_SINGLE_TEMPLATE(name, ...) WRAP_BASE(name) WRAP_TEMPLATE(name, name, __VA_ARGS__)

  //Vector AXPY
-  WRAP_TEMPLATE(vaxpy, uint, uint, uint, atidlas::fetching_policy_type)
-  WRAP_TEMPLATE(maxpy, uint, uint, uint, uint, uint, atidlas::fetching_policy_type)
-  WRAP_TEMPLATE(reduction, uint, uint, uint, atidlas::fetching_policy_type)
-  WRAP_TEMPLATE(mreduction_rows, uint, uint, uint, uint, atidlas::fetching_policy_type)
-  WRAP_TEMPLATE(mreduction_cols, uint, uint, uint, uint, atidlas::fetching_policy_type)
-  WRAP_TEMPLATE(mproduct_nn, uint, uint, uint, uint, uint, uint, uint, atidlas::fetching_policy_type, atidlas::fetching_policy_type, uint, uint)
-  WRAP_TEMPLATE(mproduct_tn, uint, uint, uint, uint, uint, uint, uint, atidlas::fetching_policy_type, atidlas::fetching_policy_type, uint, uint)
-  WRAP_TEMPLATE(mproduct_nt, uint, uint, uint, uint, uint, uint, uint, atidlas::fetching_policy_type, atidlas::fetching_policy_type, uint, uint)
-  WRAP_TEMPLATE(mproduct_tt, uint, uint, uint, uint, uint, uint, uint, atidlas::fetching_policy_type, atidlas::fetching_policy_type, uint, uint)
+  WRAP_SINGLE_TEMPLATE(vaxpy, uint, uint, uint, atidlas::fetching_policy_type)
+  WRAP_SINGLE_TEMPLATE(maxpy, uint, uint, uint, uint, uint, atidlas::fetching_policy_type)
+  WRAP_SINGLE_TEMPLATE(reduction, uint, uint, uint, atidlas::fetching_policy_type)
+  WRAP_BASE(mreduction)
+  WRAP_TEMPLATE(mreduction_rows, mreduction, uint, uint, uint, uint, atidlas::fetching_policy_type)
+  WRAP_TEMPLATE(mreduction_cols, mreduction, uint, uint, uint, uint, atidlas::fetching_policy_type)
+  WRAP_BASE(mproduct)
+  WRAP_TEMPLATE(mproduct_nn, mproduct, uint, uint, uint, uint, uint, uint, uint, atidlas::fetching_policy_type, atidlas::fetching_policy_type, uint, uint)
+  WRAP_TEMPLATE(mproduct_tn, mproduct, uint, uint, uint, uint, uint, uint, uint, atidlas::fetching_policy_type, atidlas::fetching_policy_type, uint, uint)
+  WRAP_TEMPLATE(mproduct_nt, mproduct, uint, uint, uint, uint, uint, uint, uint, atidlas::fetching_policy_type, atidlas::fetching_policy_type, uint, uint)
+  WRAP_TEMPLATE(mproduct_tt, mproduct, uint, uint, uint, uint, uint, uint, uint, atidlas::fetching_policy_type, atidlas::fetching_policy_type, uint, uint)


 }