Various fixes

2015-01-27 02:41:27 -05:00
parent 909e4b16a0
commit a96c897cb3
11 changed files with 141 additions and 139 deletions
--- a/bench/blas.cpp
+++ b/bench/blas.cpp
@@ -36,86 +36,86 @@ void bench(ad::numeric_type dtype)
  std::cout << " " << PERF << std::flush;\
  }

-//  /*---------*/
-//  /*--BLAS1--*/
-//  /*---------*/
-//  std::cout << "#AXPY" << std::endl;
-//  for(std::vector<int_t>::const_iterator it = BLAS1_N.begin() ; it != BLAS1_N.end() ; ++it)
-//  {
-//    int_t N = *it;
-//    std::cout << N;
-//    /* ATIDLAS */
-//    atidlas::array x(N, dtype), y(N, dtype);
-//    BENCHMARK(y = x + y, bandwidth(3*N, tres, dtsize));
-//    /* clAmdBlas */
-//#ifdef BENCH_CLAMDBLAS
-//    BENCHMARK(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &atidlas::cl::get_queue(x.context(), 0)(), 0, NULL, NULL), bandwidth(3*N, tres, dtsize))
-//#endif
-//    /* BLAS */
-//#ifdef BENCH_CBLAS
-//    std::vector<float> cx(N), cy(N);
-//    atidlas::copy(x, cx);
-//    atidlas::copy(y, cy);
-//    BENCHMARK(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), bandwidth(3*N, tres, dtsize));
-//#endif
-//    std::cout << std::endl;
-//  }
-//  std::cout << "\n\n" << std::flush;
+  /*---------*/
+  /*--BLAS1--*/
+  /*---------*/
+  std::cout << "#AXPY" << std::endl;
+  for(std::vector<int_t>::const_iterator it = BLAS1_N.begin() ; it != BLAS1_N.end() ; ++it)
+  {
+    int_t N = *it;
+    std::cout << N;
+    /* ATIDLAS */
+    atidlas::array x(N, dtype), y(N, dtype);
+    BENCHMARK(y = x + y, bandwidth(3*N, tres, dtsize));
+    /* clAmdBlas */
+#ifdef BENCH_CLAMDBLAS
+    BENCHMARK(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &atidlas::cl::get_queue(x.context(), 0)(), 0, NULL, NULL), bandwidth(3*N, tres, dtsize))
+#endif
+    /* BLAS */
+#ifdef BENCH_CBLAS
+    std::vector<float> cx(N), cy(N);
+    atidlas::copy(x, cx);
+    atidlas::copy(y, cy);
+    BENCHMARK(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), bandwidth(3*N, tres, dtsize));
+#endif
+    std::cout << std::endl;
+  }
+  std::cout << "\n\n" << std::flush;

-//  std::cout << "#DOT" << std::endl;
-//  for(std::vector<int_t>::const_iterator it = BLAS1_N.begin() ; it != BLAS1_N.end() ; ++it)
-//  {
-//    int_t N = *it;
-//    std::cout << N;
-//    /* ATIDLAS */
-//    atidlas::array x(N, dtype), y(N, dtype);
-//    atidlas::array scratch(N, dtype);
-//    atidlas::scalar s(dtype);
-//    BENCHMARK(s = dot(x,y), bandwidth(2*N, tres, dtsize));
-//    /* clAmdBlas */
-//#ifdef BENCH_CLAMDBLAS
-//    BENCHMARK(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &atidlas::cl::get_queue(x.context(), 0)(), 0, NULL, NULL), bandwidth(2*N, tres, dtsize))
-//#endif
-//    /* BLAS */
-//#ifdef BENCH_CBLAS
-//    std::vector<float> cx(N), cy(N);
-//    atidlas::copy(x, cx);
-//    atidlas::copy(y, cy);
-//    BENCHMARK(cblas_sdot(N, cx.data(), 1, cy.data(), 1), bandwidth(2*N, tres, dtsize));
-//#endif
-//    std::cout << std::endl;
-//  }
-//  std::cout << "\n\n" << std::flush;
+  std::cout << "#DOT" << std::endl;
+  for(std::vector<int_t>::const_iterator it = BLAS1_N.begin() ; it != BLAS1_N.end() ; ++it)
+  {
+    int_t N = *it;
+    std::cout << N;
+    /* ATIDLAS */
+    atidlas::array x(N, dtype), y(N, dtype);
+    atidlas::array scratch(N, dtype);
+    atidlas::scalar s(dtype);
+    BENCHMARK(s = dot(x,y), bandwidth(2*N, tres, dtsize));
+    /* clAmdBlas */
+#ifdef BENCH_CLAMDBLAS
+    BENCHMARK(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &atidlas::cl::get_queue(x.context(), 0)(), 0, NULL, NULL), bandwidth(2*N, tres, dtsize))
+#endif
+    /* BLAS */
+#ifdef BENCH_CBLAS
+    std::vector<float> cx(N), cy(N);
+    atidlas::copy(x, cx);
+    atidlas::copy(y, cy);
+    BENCHMARK(cblas_sdot(N, cx.data(), 1, cy.data(), 1), bandwidth(2*N, tres, dtsize));
+#endif
+    std::cout << std::endl;
+  }
+  std::cout << "\n\n" << std::flush;

-//  /*---------*/
-//  /*--BLAS2--*/
-//  /*---------*/
-//  //T-layout
-//  std::cout << "#GEMV-T" << std::endl;
-//  for(std::vector<int>::const_iterator Mit = BLAS2_M.begin() ; Mit != BLAS2_M.end() ; ++Mit)
-//    for(std::vector<int_t>::const_iterator Nit = BLAS2_N.begin() ; Nit != BLAS2_N.end() ; ++Nit)
-//    {
-//      int_t M = *Mit;
-//      int_t N = *Nit;
-//      std::cout << M << "," << N;
-//      /* ATIDLAS */
-//      atidlas::array A(N, M, dtype), y(M, dtype), x(N, dtype);
-//      BENCHMARK(y = dot(trans(A),x), bandwidth(M*N + M + N, tres, dtsize));
-//      /* clAmdBlas */
-//  #ifdef BENCH_CLAMDBLAS
-//      BENCHMARK(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &atidlas::cl::get_queue(x.context(), 0)(),0, NULL, NULL), bandwidth(M*N + M + N, tres, dtsize))
-//  #endif
-//      /* BLAS */
-//  #ifdef BENCH_CBLAS
-//      std::vector<float> cA(N*M), cx(N), cy(M);
-//      atidlas::copy(x, cx);
-//      atidlas::copy(y, cy);
-//      atidlas::copy(A, cA);
-//      BENCHMARK(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), N, cx.data(), 1, 0, cy.data(), 1), bandwidth(M*N + M + N, tres, dtsize));
-//  #endif
-//      std::cout << std::endl;
-//    }
-//    std::cout << "\n\n" << std::flush;
+  /*---------*/
+  /*--BLAS2--*/
+  /*---------*/
+  //T-layout
+  std::cout << "#GEMV-T" << std::endl;
+  for(std::vector<int>::const_iterator Mit = BLAS2_M.begin() ; Mit != BLAS2_M.end() ; ++Mit)
+    for(std::vector<int_t>::const_iterator Nit = BLAS2_N.begin() ; Nit != BLAS2_N.end() ; ++Nit)
+    {
+      int_t M = *Mit;
+      int_t N = *Nit;
+      std::cout << M << "," << N;
+      /* ATIDLAS */
+      atidlas::array A(N, M, dtype), y(M, dtype), x(N, dtype);
+      BENCHMARK(y = dot(trans(A),x), bandwidth(M*N + M + N, tres, dtsize));
+      /* clAmdBlas */
+  #ifdef BENCH_CLAMDBLAS
+      BENCHMARK(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &atidlas::cl::get_queue(x.context(), 0)(),0, NULL, NULL), bandwidth(M*N + M + N, tres, dtsize))
+  #endif
+      /* BLAS */
+  #ifdef BENCH_CBLAS
+      std::vector<float> cA(N*M), cx(N), cy(M);
+      atidlas::copy(x, cx);
+      atidlas::copy(y, cy);
+      atidlas::copy(A, cA);
+      BENCHMARK(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), N, cx.data(), 1, 0, cy.data(), 1), bandwidth(M*N + M + N, tres, dtsize));
+  #endif
+      std::cout << std::endl;
+    }
+    std::cout << "\n\n" << std::flush;

 //  /*---------*/
 //  /*--BLAS3--*/
--- a/bench/common.hpp
+++ b/bench/common.hpp
@@ -47,8 +47,8 @@ private:
 static const std::vector<int> BLAS1_N = create_log_range(1e3, 2e7, 50, 64);

 // BLAS2 Sizes
-static const std::vector<int> BLAS2_M = make_vector<int>() << 256;
-static const std::vector<int> BLAS2_N = create_full_range(128, 5000, 64);
+static const std::vector<int> BLAS2_N = make_vector<int>() << 64;
+static const std::vector<int> BLAS2_M = create_full_range(128, 10000, 64);

 // BLAS3 Sizes
 static const std::vector<int> BLAS3_M = make_vector<int>() << 1024;
--- a/include/atidlas/backend/templates/maxpy.h
+++ b/include/atidlas/backend/templates/maxpy.h
@@ -10,10 +10,7 @@ namespace atidlas
 class maxpy_parameters : public base::parameters_type
 {
 public:
-  maxpy_parameters(unsigned int _simd_width,
-                              unsigned int _local_size_0, unsigned int _local_size_1,
-                              unsigned int _num_groups_0, unsigned int _num_groups_1,
-                              fetching_policy_type _fetching_policy);
+  maxpy_parameters(unsigned int _simd_width, unsigned int _local_size_0, unsigned int _local_size_1, unsigned int _num_groups_0, unsigned int _num_groups_1, fetching_policy_type _fetching_policy);

  unsigned int num_groups_0;
  unsigned int num_groups_1;
@@ -28,14 +25,9 @@ private:
  std::vector<std::string> generate_impl(unsigned int label, symbolic_expressions_container const & symbolic_expressions, std::vector<mapping_type> const & mappings) const;
 public:
  maxpy(parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
-  maxpy(unsigned int simd, unsigned int ls1, unsigned int ls2,
-                                 unsigned int ng1, unsigned int ng2, fetching_policy_type fetch,
-                                 binding_policy_t bind = BIND_ALL_UNIQUE);
+  maxpy(unsigned int simd, unsigned int ls1, unsigned int ls2,  unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, binding_policy_t bind = BIND_ALL_UNIQUE);
  std::vector<int_t> input_sizes(symbolic_expressions_container const & symbolic_expressions);
-  void enqueue(cl::CommandQueue & queue,
-               std::vector<cl::lazy_compiler> & programs,
-               unsigned int label,
-               symbolic_expressions_container const & symbolic_expressions);
+  void enqueue(cl::CommandQueue & queue, std::vector<cl::lazy_compiler> & programs,  unsigned int label, symbolic_expressions_container const & symbolic_expressions);
 };

 }
--- a/include/atidlas/backend/templates/mproduct.h
+++ b/include/atidlas/backend/templates/mproduct.h
@@ -36,7 +36,7 @@ private:
  unsigned int lmem_usage(symbolic_expressions_container const & symbolic_expressions) const;
  unsigned int registers_usage(symbolic_expressions_container const & symbolic_expressions) const;
  int check_invalid_impl(cl::Device const &, symbolic_expressions_container const &) const;
-  std::string generate_impl(unsigned int label, char id, const symbolic_expressions_container &symbolic_expressions, const std::vector<mapping_type> &, bool fallback) const;
+  std::string generate_impl(unsigned int label, const char *  id, const symbolic_expressions_container &symbolic_expressions, const std::vector<mapping_type> &, bool fallback) const;
  std::vector<std::string> generate_impl(unsigned int label, symbolic_expressions_container const & symbolic_expressions, std::vector<mapping_type> const & mappings) const;
  void enqueue_block(cl::CommandQueue & queue, int_t M, int_t N, int_t K,
                     array_infos const & A, array_infos const & B, array_infos const & C,
--- a/include/atidlas/backend/templates/reduction.h
+++ b/include/atidlas/backend/templates/reduction.h
@@ -22,7 +22,7 @@ private:
  int check_invalid_impl(cl::Device const &, symbolic_expressions_container const &) const;
  inline void reduce_1d_local_memory(kernel_generation_stream & stream, unsigned int size, std::vector<mapped_scalar_reduction*> exprs,
                                     std::string const & buf_str, std::string const & buf_value_str) const;
-  std::string generate_impl(unsigned int label, char type, symbolic_expressions_container const & symbolic_expressions, std::vector<mapping_type> const & mappings, unsigned int simd_width) const;
+  std::string generate_impl(unsigned int label, const char * type, symbolic_expressions_container const & symbolic_expressions, std::vector<mapping_type> const & mappings, unsigned int simd_width) const;
  std::vector<std::string> generate_impl(unsigned int label,  symbolic_expressions_container const & symbolic_expressions, std::vector<mapping_type> const & mappings) const;

 public:
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -5,15 +5,12 @@ add_library(atidlas SHARED ${LIBATIDLAS_SRC})
 set_target_properties(atidlas PROPERTIES
   COMPILE_FLAGS "-Wno-sign-compare -D__CL_ENABLE_EXCEPTIONS -Wall -Wextra -pedantic")

-set(PREINSTALL_PY "${CMAKE_CURRENT_BINARY_DIR}/preinstall.py")
+
+#install(TARGETS atidlas LIBRARY DESTINATION lib)
+#set(INSTALL_INCLUDE_DIR /usr/local/include)
+#install(DIRECTORY atidlas "${PROJECT_SOURCE_DIR}/include/atidlas"
+#        DESTINATION "${INSTALL_INCLUDE_DIR}"  FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp")
+
 set(POSTINSTALL_PY "${CMAKE_CURRENT_BINARY_DIR}/postinstall.py")
-
-configure_file("${PROJECT_SOURCE_DIR}/python/preinstall.py" ${PREINSTALL_PY})
 configure_file("${PROJECT_SOURCE_DIR}/python/postinstall.py" ${POSTINSTALL_PY})
-
-install(CODE "execute_process(COMMAND ${PYTHON} ${PREINSTALL_PY})")
-install(TARGETS atidlas LIBRARY DESTINATION lib)
-set(INSTALL_INCLUDE_DIR /usr/local/include)
-install(DIRECTORY atidlas "${PROJECT_SOURCE_DIR}/include/atidlas"
-        DESTINATION "${INSTALL_INCLUDE_DIR}"  FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp")
-install(CODE "execute_process(COMMAND ${PYTHON} ${POST_INSTALL_SCRIPT_PY})")
+install(CODE "execute_process(COMMAND ${PYTHON} ${POSTINSTALL_PY})")
--- a/lib/backend/templates/maxpy.cpp
+++ b/lib/backend/templates/maxpy.cpp
@@ -29,8 +29,11 @@ std::string maxpy::generate_impl(unsigned int label, symbolic_expressions_contai

  std::string init0, upper_bound0, inc0, init1, upper_bound1, inc1;

+  char kprefix[10];
+  fill_kernel_name(kprefix, label, "d");
+
  stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl;
-  stream << "__kernel void " << "k" << label << "d" << "(unsigned int M, unsigned int N, " << generate_arguments("#scalartype", mappings, symbolic_expressions) << ")" << std::endl;
+  stream << "__kernel void " << kprefix << "(unsigned int M, unsigned int N, " << generate_arguments("#scalartype", mappings, symbolic_expressions) << ")" << std::endl;
  stream << "{" << std::endl;
  stream.inc_tab();

--- a/lib/backend/templates/mproduct.cpp
+++ b/lib/backend/templates/mproduct.cpp
@@ -87,7 +87,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
    return TEMPLATE_VALID;
  }

-  std::string mproduct::generate_impl(unsigned int label, char id, const symbolic_expressions_container &symbolic_expressions, const std::vector<mapping_type> &, bool fallback) const
+  std::string mproduct::generate_impl(unsigned int label, const char * id, const symbolic_expressions_container &symbolic_expressions, const std::vector<mapping_type> &, bool fallback) const
  {
    using std::string;
    using tools::to_string;
@@ -121,7 +121,10 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
    /// //////////////
    std::string widthdtype = append_width("#scalartype", p.simd_width);
    stream << " __attribute__((reqd_work_group_size(" << p.local_size_0 << "," << p.local_size_1 << ",1)))" << std::endl;
-    stream << "__kernel void " << "k" << label << id << "(unsigned int M, unsigned int N,  unsigned int K, "
+    char kprefix[10];
+    fill_kernel_name(kprefix, label, id);
+
+    stream << "__kernel void " << kprefix << "(unsigned int M, unsigned int N,  unsigned int K, "
                               << C.process("__global #scalartype* #pointer, uint #ld, uint #start1, uint #start2, uint #stride1, uint #stride2,")
                               << alpha.process("#scalartype #name,")
                               << A.process("__global " + widthdtype + "* #pointer, uint #ld, uint #start1, uint #start2, uint #stride1, uint #stride2,")
@@ -557,8 +560,8 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
  std::vector<std::string> mproduct::generate_impl(unsigned int label, symbolic_expressions_container const & symbolic_expressions, std::vector<mapping_type> const & mappings) const
  {
    std::vector<std::string> res;
-    res.push_back(generate_impl(label, 'o', symbolic_expressions, mappings, false));
-    res.push_back(generate_impl(label, 'f', symbolic_expressions, mappings, true));
+    res.push_back(generate_impl(label, "o", symbolic_expressions, mappings, false));
+    res.push_back(generate_impl(label, "f", symbolic_expressions, mappings, true));
    return res;
  }

--- a/lib/backend/templates/mreduction.cpp
+++ b/lib/backend/templates/mreduction.cpp
@@ -36,8 +36,11 @@ std::string mreduction::generate_impl(unsigned int label, symbolic_expressions_c

  kernel_generation_stream stream;

+  char kprefix[10];
+  fill_kernel_name(kprefix, label, "d");
+
  stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl;
-  stream << "__kernel void " << "k" << label << "d" << "(unsigned int M, unsigned int N, " << generate_arguments("#scalartype", mappings, symbolic_expressions) << ")" << std::endl;
+  stream << "__kernel void " << kprefix << "(unsigned int M, unsigned int N, " << generate_arguments("#scalartype", mappings, symbolic_expressions) << ")" << std::endl;
  stream << "{" << std::endl;
  stream.inc_tab();

--- a/lib/backend/templates/reduction.cpp
+++ b/lib/backend/templates/reduction.cpp
@@ -56,7 +56,7 @@ inline void reduction::reduce_1d_local_memory(kernel_generation_stream & stream,
  stream << "}" << std::endl;
 }

-std::string reduction::generate_impl(unsigned int label, char type, symbolic_expressions_container const & symbolic_expressions, std::vector<mapping_type> const & mappings, unsigned int simd_width) const
+std::string reduction::generate_impl(unsigned int label, const char * type, symbolic_expressions_container const & symbolic_expressions, std::vector<mapping_type> const & mappings, unsigned int simd_width) const
 {
  kernel_generation_stream stream;

@@ -85,8 +85,11 @@ std::string reduction::generate_impl(unsigned int label, char type, symbolic_exp
  /* ------------------------
   * First Kernel
   * -----------------------*/
+  char kprefix[10];
+  fill_kernel_name(kprefix, label, type);
+
  stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << ",1,1)))" << std::endl;
-  stream << "__kernel void " << "k" << label << type << "0" << "(" << arguments << generate_arguments("#scalartype", mappings, symbolic_expressions) << ")" << std::endl;
+  stream << "__kernel void " << kprefix << "0" << "(" << arguments << generate_arguments("#scalartype", mappings, symbolic_expressions) << ")" << std::endl;
  stream << "{" << std::endl;
  stream.inc_tab();

@@ -191,7 +194,7 @@ std::string reduction::generate_impl(unsigned int label, char type, symbolic_exp
   * Second kernel
   * -----------------------*/
  stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << ",1,1)))" << std::endl;
-  stream << "__kernel void " << "k" << label << type << "1" << "(" << arguments << generate_arguments("#scalartype", mappings, symbolic_expressions) << ")" << std::endl;
+  stream << "__kernel void " << kprefix << "1" << "(" << arguments << generate_arguments("#scalartype", mappings, symbolic_expressions) << ")" << std::endl;
  stream << "{" << std::endl;
  stream.inc_tab();

@@ -256,8 +259,8 @@ std::string reduction::generate_impl(unsigned int label, char type, symbolic_exp
 std::vector<std::string> reduction::generate_impl(unsigned int label,  symbolic_expressions_container const & symbolic_expressions, std::vector<mapping_type> const & mappings) const
 {
  std::vector<std::string> result;
-  result.push_back(generate_impl(label, 'f', symbolic_expressions, mappings, 1));
-  result.push_back(generate_impl(label, 'o', symbolic_expressions, mappings, p_.simd_width));
+  result.push_back(generate_impl(label, "f", symbolic_expressions, mappings, 1));
+  result.push_back(generate_impl(label, "o", symbolic_expressions, mappings, p_.simd_width));
  return result;
 }

--- a/lib/model/model.cpp
+++ b/lib/model/model.cpp
@@ -142,14 +142,14 @@ namespace detail
  static expression_type get_expression_type(std::string const & name)
  {
    if(name=="vaxpy") return VECTOR_AXPY_TYPE;
-    if(name=="reduction") return REDUCTION_TYPE;
+    if(name=="dot") return REDUCTION_TYPE;
    if(name=="maxpy") return MATRIX_AXPY_TYPE;
-    if(name=="row-wise-reductionN") return ROW_WISE_REDUCTION_TYPE;
-    if(name=="row-wise-reductionT") return COL_WISE_REDUCTION_TYPE;
-    if(name=="matrix-productNN") return MATRIX_PRODUCT_NN_TYPE;
-    if(name=="matrix-productNT") return MATRIX_PRODUCT_NT_TYPE;
-    if(name=="matrix-productTN") return MATRIX_PRODUCT_TN_TYPE;
-    if(name=="matrix-productTT") return MATRIX_PRODUCT_TT_TYPE;
+    if(name=="gemvN") return ROW_WISE_REDUCTION_TYPE;
+    if(name=="gemvT") return COL_WISE_REDUCTION_TYPE;
+    if(name=="gemmNN") return MATRIX_PRODUCT_NN_TYPE;
+    if(name=="gemmNT") return MATRIX_PRODUCT_NT_TYPE;
+    if(name=="gemmTN") return MATRIX_PRODUCT_TN_TYPE;
+    if(name=="gemmTT") return MATRIX_PRODUCT_TT_TYPE;
    throw ;
  }

@@ -164,22 +164,23 @@ namespace detail
  {
    fetching_policy_type fetch[] = {FETCH_FROM_LOCAL, FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_CONTIGUOUS};
    if(template_name=="vaxpy")
-      return tools::shared_ptr<base>(new vaxpy( vaxpy_parameters(a[0], a[1], a[2], fetch[a[3]])));
-    else if(template_name=="reduction")
-      return tools::shared_ptr<base>(new reduction( reduction_parameters(a[0], a[1], a[2], fetch[a[3]])));
+      return tools::shared_ptr<base>(new vaxpy(a[0], a[1], a[2], fetch[a[3]]));
+    else if(template_name=="dot")
+      return tools::shared_ptr<base>(new reduction(a[0], a[1], a[2], fetch[a[3]]));
    else if(template_name=="maxpy")
-      return tools::shared_ptr<base>(new maxpy( maxpy_parameters(a[0], a[1], a[2], a[3], a[4], fetch[a[5]])));
-    else if(template_name.find("row-wise-reduction")!=std::string::npos)
-    {
-      return tools::shared_ptr<base>(new mreduction_rows( mreduction_parameters(a[0], a[1], a[2], a[3], fetch[a[4]])));
-    }
-    else if(template_name.find("matrix-product")!=std::string::npos)
-    {
-      char A_trans = template_name[15];
-      char B_trans = template_name[16];
-      return tools::shared_ptr<base>(new mproduct( mproduct_parameters(a[0], a[1], a[2], a[3], a[4], a[5], a[6],
-                                                                                                fetch[a[7]], fetch[a[8]], a[9], a[10]), A_trans, B_trans));
-    }
+      return tools::shared_ptr<base>(new maxpy(a[0], a[1], a[2], a[3], a[4], fetch[a[5]]));
+    else if(template_name.find("gemvN")!=std::string::npos)
+      return tools::shared_ptr<base>(new mreduction_rows(a[0], a[1], a[2], a[3], fetch[a[4]]));
+    else if(template_name.find("gemvT")!=std::string::npos)
+      return tools::shared_ptr<base>(new mreduction_cols(a[0], a[1], a[2], a[3], fetch[a[4]]));
+    else if(template_name.find("gemmNN")!=std::string::npos)
+      return tools::shared_ptr<base>(new mproduct_nn(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
+    else if(template_name.find("gemmTN")!=std::string::npos)
+      return tools::shared_ptr<base>(new mproduct_tn(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
+    else if(template_name.find("gemmNT")!=std::string::npos)
+      return tools::shared_ptr<base>(new mproduct_nt(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
+    else if(template_name.find("gemmTT")!=std::string::npos)
+      return tools::shared_ptr<base>(new mproduct_tt(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
    else
      throw operation_not_supported_exception("Cannot create the given operation");
  }
@@ -198,9 +199,9 @@ void import(std::string const & fname, cl::CommandQueue & queue, model_map_t& re
  str.assign((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>());
  document.Parse<0>(str.c_str());
  //Deserialize
-  std::vector<std::string> operations = tools::make_vector<std::string>() << "vaxpy" << "reduction"
-                                                                          << "maxpy" << "row-wise-reductionN" << "row-wise-reductionT"
-                                                                          << "matrix-productNN" << "matrix-productTN" << "matrix-productNT" << "matrix-productTT";
+  std::vector<std::string> operations = tools::make_vector<std::string>() << "vaxpy" << "dot"
+                                                                          << "maxpy" << "gemvN" << "gemvT"
+                                                                          << "gemmNN" << "gemmTN" << "gemmNT" << "gemmTT"; // every name get_expression_type() accepts
  std::vector<std::string> dtype = tools::make_vector<std::string>() << "float32" << "float64";
  for(std::vector<std::string>::iterator op = operations.begin() ; op != operations.end() ; ++op)
  {