Code Quality: More sensible names

Philippe Tillet
2015-12-12 18:32:06 -05:00
parent 46dad59e10
commit 042aa070bb
31 changed files with 379 additions and 379 deletions


@@ -23,15 +23,15 @@ enum expression_type
inline expression_type expression_type_from_string(std::string const & name)
{
if(name=="axpy") return AXPY_TYPE;
if(name=="dot") return DOT_TYPE;
if(name=="ger") return GER_TYPE;
if(name=="gemv_n") return GEMV_N_TYPE;
if(name=="gemv_t") return GEMV_T_TYPE;
if(name=="gemm_nn") return GEMM_NN_TYPE;
if(name=="gemm_nt") return GEMM_NT_TYPE;
if(name=="gemm_tn") return GEMM_TN_TYPE;
if(name=="gemm_tt") return GEMM_TT_TYPE;
if(name=="elementwise_1d") return AXPY_TYPE;
if(name=="reduce_1d") return DOT_TYPE;
if(name=="elementwise_2d") return GER_TYPE;
if(name=="reduce_2d_n") return GEMV_N_TYPE;
if(name=="reduce_2d_t") return GEMV_T_TYPE;
if(name=="matrix_product_nn") return GEMM_NN_TYPE;
if(name=="matrix_product_nt") return GEMM_NT_TYPE;
if(name=="matrix_product_tn") return GEMM_TN_TYPE;
if(name=="matrix_product_tt") return GEMM_TT_TYPE;
throw std::invalid_argument("Unrecognized expression: " + name);
}
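The accepted strings change, but the enum constants keep their historical BLAS-style names, so code that switches on expression_type is unaffected. A minimal sketch of resolving one of the new names (hypothetical caller, not part of this commit):

    // assumes the header declaring expression_type_from_string is included
    expression_type t = expression_type_from_string("reduce_1d");
    // t == DOT_TYPE: only the recognized strings changed, not the enumerators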


@@ -70,7 +70,7 @@ cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem /* memobj */,
* Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
* before using.
*
* clLogMessagesToSystemLog fowards on all log messages to the Apple System Logger
* clLogMessagesToSystemLog fowards on all log messages to the Apple System Logelementwise_2d
*/
#define cl_APPLE_ContextLoggingFunctions 1
extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * /* errstr */,


@@ -200,7 +200,7 @@ extern "C" {
/**
* CUDA device pointer
* CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform.
* CUdeviceptr is defined as an unsigned inteelementwise_2d type whose size matches the size of a pointer on the target platform.
*/
#if __CUDA_API_VERSION >= 3020
@@ -337,12 +337,12 @@ typedef enum CUoccupancy_flags_enum {
* Array formats
*/
typedef enum CUarray_format_enum {
CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */
CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */
CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */
CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */
CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit inteelementwise_2ds */
CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit inteelementwise_2ds */
CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit inteelementwise_2ds */
CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit inteelementwise_2ds */
CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit inteelementwise_2ds */
CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit inteelementwise_2ds */
CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */
CU_AD_FORMAT_FLOAT = 0x20 /**< 32-bit floating point */
} CUarray_format;
@@ -558,8 +558,8 @@ typedef enum CUfunction_attribute_enum {
*/
typedef enum CUfunc_cache_enum {
CU_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */
CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */
CU_FUNC_CACHE_PREFER_L1 = 0x02, /**< prefer larger L1 cache and smaller shared memory */
CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larelementwise_2d shared memory and smaller L1 cache */
CU_FUNC_CACHE_PREFER_L1 = 0x02, /**< prefer larelementwise_2d L1 cache and smaller shared memory */
CU_FUNC_CACHE_PREFER_EQUAL = 0x03 /**< prefer equal sized L1 cache and shared memory */
} CUfunc_cache;
@@ -909,7 +909,7 @@ typedef enum cudaError_enum {
/**
* \deprecated
* This error return is deprecated as of CUDA 5.0. It is no longer an error
* This error return is deprecated as of CUDA 5.0. It is no lonelementwise_2d an error
* to attempt to enable/disable the profiling via ::cuProfilerStart or
* ::cuProfilerStop without initialization.
*/
@@ -917,14 +917,14 @@ typedef enum cudaError_enum {
/**
* \deprecated
* This error return is deprecated as of CUDA 5.0. It is no longer an error
* This error return is deprecated as of CUDA 5.0. It is no lonelementwise_2d an error
* to call cuProfilerStart() when profiling is already enabled.
*/
CUDA_ERROR_PROFILER_ALREADY_STARTED = 7,
/**
* \deprecated
* This error return is deprecated as of CUDA 5.0. It is no longer an error
* This error return is deprecated as of CUDA 5.0. It is no lonelementwise_2d an error
* to call cuProfilerStop() when profiling is already disabled.
*/
CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8,
@@ -962,7 +962,7 @@ typedef enum cudaError_enum {
* This indicated that the context being supplied as a parameter to the
* API call was already the active context.
* \deprecated
* This error return is deprecated as of CUDA 3.2. It is no longer an
* This error return is deprecated as of CUDA 3.2. It is no lonelementwise_2d an
* error to attempt to push the active context via ::cuCtxPushCurrent().
*/
CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202,
@@ -1163,7 +1163,7 @@ typedef enum cudaError_enum {
CUDA_ERROR_CONTEXT_IS_DESTROYED = 709,
/**
* A device-side assert triggered during kernel execution. The context
* A device-side assert trigelementwise_2ded during kernel execution. The context
* cannot be used anymore, and must be destroyed. All existing device
* memory allocations from this context are invalid and must be
* reconstructed if the program is to continue using CUDA.
@@ -1499,24 +1499,24 @@ typedef struct CUDA_TEXTURE_DESC_st {
typedef enum CUresourceViewFormat_enum
{
CU_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */
CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */
CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */
CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */
CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */
CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */
CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */
CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */
CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */
CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */
CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */
CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */
CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */
CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */
CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */
CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */
CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */
CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */
CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */
CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit inteelementwise_2ds */
CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit inteelementwise_2ds */
CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit inteelementwise_2ds */
CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit inteelementwise_2ds */
CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit inteelementwise_2ds */
CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit inteelementwise_2ds */
CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit inteelementwise_2ds */
CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit inteelementwise_2ds */
CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit inteelementwise_2ds */
CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit inteelementwise_2ds */
CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit inteelementwise_2ds */
CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit inteelementwise_2ds */
CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit inteelementwise_2ds */
CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit inteelementwise_2ds */
CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit inteelementwise_2ds */
CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit inteelementwise_2ds */
CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit inteelementwise_2ds */
CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit inteelementwise_2ds */
CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */
CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */
CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */
@@ -1606,7 +1606,7 @@ typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st {
#define CU_TRSA_OVERRIDE_FORMAT 0x01
/**
* Read the texture as integers rather than promoting the values to floats
* Read the texture as inteelementwise_2ds rather than promoting the values to floats
* in the range [0,1].
* Flag for ::cuTexRefSetFlags()
*/
@@ -1901,7 +1901,7 @@ CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev);
/**
* \brief Returns information about the device
*
* Returns in \p *pi the integer value of the attribute \p attrib on device
* Returns in \p *pi the inteelementwise_2d value of the attribute \p attrib on device
* \p dev. The supported attributes are:
* - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per
* block;
@@ -2819,7 +2819,7 @@ CUresult CUDAAPI cuCtxSynchronize(void);
* violated. This limit can be set smaller than the default or up the maximum
* launch depth of 24. When setting this limit, keep in mind that additional
* levels of sync depth require the driver to reserve large amounts of device
* memory which can no longer be used for user allocations. If these
* memory which can no lonelementwise_2d be used for user allocations. If these
* reservations of device memory fail, ::cuCtxSetLimit will return
* ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
* This limit is only applicable to devices of compute capability 3.5 and
@@ -2836,7 +2836,7 @@ CUresult CUDAAPI cuCtxSynchronize(void);
* the default (2048 launches) are needed for a module using the device
* runtime, this limit can be increased. Keep in mind that being able to
* sustain additional pending launches will require the driver to reserve
* larger amounts of device memory upfront which can no longer be used for
* larelementwise_2d amounts of device memory upfront which can no lonelementwise_2d be used for
* allocations. If these reservations fail, ::cuCtxSetLimit will return
* ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
* This limit is only applicable to devices of compute capability 3.5 and
@@ -2921,8 +2921,8 @@ CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit);
*
* The supported cache configurations are:
* - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
* - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
* - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
* - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larelementwise_2d shared memory and smaller L1 cache
* - ::CU_FUNC_CACHE_PREFER_L1: prefer larelementwise_2d L1 cache and smaller shared memory
* - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
*
* \param pconfig - Returned cache configuration
@@ -2971,8 +2971,8 @@ CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig);
*
* The supported cache configurations are:
* - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
* - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
* - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
* - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larelementwise_2d shared memory and smaller L1 cache
* - ::CU_FUNC_CACHE_PREFER_L1: prefer larelementwise_2d L1 cache and smaller shared memory
* - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
*
* \param config - Requested cache configuration
@@ -3054,7 +3054,7 @@ CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig);
*
* Changing the shared memory bank size will not increase shared memory usage
* or affect occupancy of kernels, but may have major effects on performance.
* Larger bank sizes will allow for greater potential bandwidth to shared memory,
* Larelementwise_2d bank sizes will allow for greater potential bandwidth to shared memory,
* but will change what kinds of accesses to shared memory will result in bank
* conflicts.
*
@@ -7358,7 +7358,7 @@ CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute
* See further documentation in the section titled "API synchronization behavior"
* to learn more about cases when synchronous memory operations can
* exhibit asynchronous behavior.
* \p value will be considered as a pointer to an unsigned integer to which this attribute is to be set.
* \p value will be considered as a pointer to an unsigned inteelementwise_2d to which this attribute is to be set.
*
* \param value - Pointer to memory containing the value to be set
* \param attribute - Pointer attribute to set
@@ -7534,7 +7534,7 @@ CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int fla
* See ::cuStreamCreateWithPriority for details about priority clamping.
*
* \param hStream - Handle to the stream to be queried
* \param priority - Pointer to a signed integer in which the stream's priority is returned
* \param priority - Pointer to a signed inteelementwise_2d in which the stream's priority is returned
* \return
* ::CUDA_SUCCESS,
* ::CUDA_ERROR_DEINITIALIZED,
@@ -7560,7 +7560,7 @@ CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority);
* and return the flags in \p flags.
*
* \param hStream - Handle to the stream to be queried
* \param flags - Pointer to an unsigned integer in which the stream's flags are returned
* \param flags - Pointer to an unsigned inteelementwise_2d in which the stream's flags are returned
* The value returned in \p flags is a logical 'OR' of all flags that
* were used while creating this stream. See ::cuStreamCreate for the list
* of valid flags
@@ -8104,7 +8104,7 @@ CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUeven
/**
* \brief Returns information about a function
*
* Returns in \p *pi the integer value of the attribute \p attrib on the kernel
* Returns in \p *pi the inteelementwise_2d value of the attribute \p attrib on the kernel
* given by \p hfunc. The supported attributes are:
* - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads
* per block, beyond which a launch of the function would fail. This number
@@ -8175,8 +8175,8 @@ CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunc
*
* The supported cache configurations are:
* - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
* - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
* - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
* - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larelementwise_2d shared memory and smaller L1 cache
* - ::CU_FUNC_CACHE_PREFER_L1: prefer larelementwise_2d L1 cache and smaller shared memory
* - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
*
* \param hfunc - Kernel to configure cache for
@@ -8215,7 +8215,7 @@ CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
*
* Changing the shared memory bank size will not increase shared memory usage
* or affect occupancy of kernels, but may have major effects on performance.
* Larger bank sizes will allow for greater potential bandwidth to shared memory,
* Larelementwise_2d bank sizes will allow for greater potential bandwidth to shared memory,
* but will change what kinds of accesses to shared memory will result in bank
* conflicts.
*
@@ -8491,11 +8491,11 @@ CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes);
CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes);
/**
* \brief Adds an integer parameter to the function's argument list
* \brief Adds an inteelementwise_2d parameter to the function's argument list
*
* \deprecated
*
* Sets an integer parameter that will be specified the next time the
* Sets an inteelementwise_2d parameter that will be specified the next time the
* kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset.
*
* \param hfunc - Kernel to add parameter to
@@ -9299,8 +9299,8 @@ CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAnis
* returned through the texture reference \p hTexRef. The valid flags are:
*
* - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of
* having the texture promote integer data to floating point data in the
* range [0, 1]. Note that texture with 32-bit integer format
* having the texture promote inteelementwise_2d data to floating point data in the
* range [0, 1]. Note that texture with 32-bit inteelementwise_2d format
* would not be promoted, regardless of whether or not this
* flag is specified;
* - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the
@@ -9859,8 +9859,8 @@ CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);
* This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR.
*
* - ::CUDA_TEXTURE_DESC::flags can be any combination of the following:
* - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of having the texture promote integer data to floating point data in the
* range [0, 1]. Note that texture with 32-bit integer format would not be promoted, regardless of whether or not this flag is specified.
* - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of having the texture promote inteelementwise_2d data to floating point data in the
* range [0, 1]. Note that texture with 32-bit inteelementwise_2d format would not be promoted, regardless of whether or not this flag is specified.
* - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior of having the texture coordinates range from [0, Dim) where Dim is
* the width or height of the CUDA array. Instead, the texture coordinates [0, 1.0) reference the entire breadth of the array dimension; Note
* that for CUDA mipmapped arrays, this flag has to be set.


@@ -89,46 +89,46 @@ protected:
*
* Maps prod(matrix_expression, matrix_expression)
*/
class mapped_gemm : public mapped_object, public binary_leaf
class mapped_matrix_product : public mapped_object, public binary_leaf
{
public:
mapped_gemm(std::string const & scalartype, unsigned int id, node_info info);
mapped_matrix_product(std::string const & scalartype, unsigned int id, node_info info);
};
/** @brief Reduction
*
* Base class for mapping a dot
* Base class for mapping a reduce_1d
*/
class mapped_dot : public mapped_object, public binary_leaf
class mapped_reduce : public mapped_object, public binary_leaf
{
public:
mapped_dot(std::string const & scalartype, unsigned int id, node_info info, std::string const & type_key);
mapped_reduce(std::string const & scalartype, unsigned int id, node_info info, std::string const & type_key);
size_t root_idx() const;
isaac::math_expression const & math_expression() const;
math_expression::node root_node() const;
bool is_index_dot() const;
bool is_index_reduction() const;
op_element root_op() const;
};
/** @brief Scalar dot
/** @brief 1D Reduction
*
* Maps a scalar dot (max, min, argmax, inner_prod, etc..)
* Maps a 1d reduction (max, min, argmax, inner_prod, etc..)
*/
class mapped_scalar_dot : public mapped_dot
class mapped_reduce_1d : public mapped_reduce
{
public:
mapped_scalar_dot(std::string const & scalartype, unsigned int id, node_info info);
mapped_reduce_1d(std::string const & scalartype, unsigned int id, node_info info);
};
/** @brief Vector dot
/** @brief 2D
*
* Maps a row-wise dot (max, min, argmax, matrix-vector product, etc..)
* Maps a 2D reduction (max, min, argmax, matrix-vector product, etc..)
*/
class mapped_gemv : public mapped_dot
class mapped_reduce_2d : public mapped_reduce
{
public:
mapped_gemv(std::string const & scalartype, unsigned int id, node_info info);
mapped_reduce_2d(std::string const & scalartype, unsigned int id, node_info info);
};
/** @brief Host scalar
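The renamed mapped objects describe operations by their structure rather than by BLAS routine. A condensed sketch of the hierarchy after this change (declarations only, bodies omitted):

    class mapped_matrix_product : public mapped_object, public binary_leaf { /* was mapped_gemm */ };
    class mapped_reduce         : public mapped_object, public binary_leaf { /* was mapped_dot */ };
    class mapped_reduce_1d      : public mapped_reduce { /* was mapped_scalar_dot */ };
    class mapped_reduce_2d      : public mapped_reduce { /* was mapped_gemv */ };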


@@ -13,8 +13,8 @@ namespace detail
{
bool is_node_leaf(op_element const & op);
bool is_scalar_dot(math_expression::node const & node);
bool is_vector_dot(math_expression::node const & node);
bool is_scalar_reduce_1d(math_expression::node const & node);
bool is_vector_reduce_1d(math_expression::node const & node);
bool is_assignment(op_element const & op);
bool is_elementwise_operator(op_element const & op);
bool is_elementwise_function(op_element const & op);


@@ -8,22 +8,22 @@ namespace isaac
namespace templates
{
class axpy_parameters : public base::parameters_type
class elementwise_1d_parameters : public base::parameters_type
{
public:
axpy_parameters(unsigned int _simd_width, unsigned int _group_size, unsigned int _num_groups, fetching_policy_type _fetching_policy);
elementwise_1d_parameters(unsigned int _simd_width, unsigned int _group_size, unsigned int _num_groups, fetching_policy_type _fetching_policy);
unsigned int num_groups;
fetching_policy_type fetching_policy;
};
class axpy : public base_impl<axpy, axpy_parameters>
class elementwise_1d : public base_impl<elementwise_1d, elementwise_1d_parameters>
{
private:
virtual int is_invalid_impl(driver::Device const &, math_expression const &) const;
std::string generate_impl(std::string const & suffix, math_expression const & expressions, driver::Device const & device, mapping_type const & mappings) const;
public:
axpy(axpy::parameters_type const & parameters, binding_policy_t binding_policy = BIND_INDEPENDENT);
axpy(unsigned int _simd_width, unsigned int _group_size, unsigned int _num_groups, fetching_policy_type _fetching_policy, binding_policy_t binding_policy = BIND_INDEPENDENT);
elementwise_1d(elementwise_1d::parameters_type const & parameters, binding_policy_t binding_policy = BIND_INDEPENDENT);
elementwise_1d(unsigned int _simd_width, unsigned int _group_size, unsigned int _num_groups, fetching_policy_type _fetching_policy, binding_policy_t binding_policy = BIND_INDEPENDENT);
std::vector<int_t> input_sizes(math_expression const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, base & fallback, execution_handler const &);
};
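Constructing the renamed template works the same way as the old axpy class; only the type name changes. A hedged usage sketch (the numeric parameters below are illustrative placeholders, not values from the commit):

    #include "isaac/kernels/templates/elementwise_1d.h"

    namespace isaac { namespace templates {
    // Hypothetical helper: simd_width=1, group_size=64, num_groups=128 are placeholders;
    // fetching_policy_type is assumed visible in this namespace, as in the header above.
    elementwise_1d make_elementwise_1d(fetching_policy_type fetch)
    {
      return elementwise_1d(1, 64, 128, fetch);   // was: axpy(1, 64, 128, fetch)
    }
    } }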


@@ -9,24 +9,24 @@ namespace isaac
namespace templates
{
class ger_parameters : public base::parameters_type
class elementwise_2d_parameters : public base::parameters_type
{
public:
ger_parameters(unsigned int _simd_width, unsigned int _local_size_0, unsigned int _local_size_1, unsigned int _num_groups_0, unsigned int _num_groups_1, fetching_policy_type _fetching_policy);
elementwise_2d_parameters(unsigned int _simd_width, unsigned int _local_size_0, unsigned int _local_size_1, unsigned int _num_groups_0, unsigned int _num_groups_1, fetching_policy_type _fetching_policy);
unsigned int num_groups_0;
unsigned int num_groups_1;
fetching_policy_type fetching_policy;
};
class ger : public base_impl<ger, ger_parameters>
class elementwise_2d : public base_impl<elementwise_2d, elementwise_2d_parameters>
{
private:
int is_invalid_impl(driver::Device const &, math_expression const &) const;
std::string generate_impl(std::string const & suffix, math_expression const & expressions, driver::Device const & device, mapping_type const & mapping) const;
public:
ger(parameters_type const & parameters, binding_policy_t binding_policy = BIND_INDEPENDENT);
ger(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, binding_policy_t bind = BIND_INDEPENDENT);
elementwise_2d(parameters_type const & parameters, binding_policy_t binding_policy = BIND_INDEPENDENT);
elementwise_2d(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, binding_policy_t bind = BIND_INDEPENDENT);
std::vector<int_t> input_sizes(math_expression const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, base & fallback, execution_handler const &);
};


@@ -10,9 +10,9 @@ namespace isaac
namespace templates
{
struct gemm_parameters : public base::parameters_type
struct matrix_product_parameters : public base::parameters_type
{
gemm_parameters(unsigned int simd_width
matrix_product_parameters(unsigned int simd_width
, unsigned int local_size_0, unsigned int KL, unsigned int local_size_1, unsigned int D
, unsigned int ms, unsigned int ks, unsigned int ns
, fetching_policy_type A_fetching_policy, fetching_policy_type B_fetching_policy
@@ -38,7 +38,7 @@ struct gemm_parameters : public base::parameters_type
bool unroll_outer;
};
class gemm : public base_impl<gemm, gemm_parameters>
class matrix_product : public base_impl<matrix_product, matrix_product_parameters>
{
private:
unsigned int temporary_workspace(math_expression const & expressions) const;
@@ -48,9 +48,9 @@ private:
std::string generate_impl(std::string const & suffix, math_expression const & expressions, driver::Device const & device, mapping_type const &) const;
void enqueue_block(driver::CommandQueue & queue, int_t M, int_t N, int_t K, array_base const & A, array_base const & B, array_base const & C,
value_scalar const &alpha, value_scalar const &beta, driver::Program const & program, std::string const & suffix, execution_options_type const & options);
std::vector<int_t> infos(math_expression const & expressions, isaac::symbolic::preset::gemm::args &arguments) const;
std::vector<int_t> infos(math_expression const & expressions, isaac::symbolic::preset::matrix_product::args &arguments) const;
public:
gemm(gemm::parameters_type const & parameters, bool check_bound, char A_trans, char B_trans);
matrix_product(matrix_product::parameters_type const & parameters, bool check_bound, char A_trans, char B_trans);
std::vector<int_t> input_sizes(math_expression const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, base & fallback, execution_handler const &ctr);
private:
@@ -60,36 +60,36 @@ private:
bool check_bounds_;
};
class gemm_nn : public gemm
class matrix_product_nn : public matrix_product
{
public:
gemm_nn(unsigned int simd, int_t ls0, int_t KL, int_t ls1, int_t D
matrix_product_nn(unsigned int simd, int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns, fetching_policy_type Afetch , fetching_policy_type Bfetch
, int_t lfetch0, int_t lfetch1, bool check_bound = false);
};
class gemm_tn : public gemm
class matrix_product_tn : public matrix_product
{
public:
gemm_tn(unsigned int simd, int_t ls0, int_t KL, int_t ls1, int_t D
matrix_product_tn(unsigned int simd, int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns, fetching_policy_type Afetch , fetching_policy_type Bfetch
, int_t lfetch0, int_t lfetch1, bool check_bound = false);
};
class gemm_nt : public gemm
class matrix_product_nt : public matrix_product
{
public:
gemm_nt(unsigned int simd, int_t ls0, int_t KL, int_t ls1, int_t D
matrix_product_nt(unsigned int simd, int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns, fetching_policy_type Afetch , fetching_policy_type Bfetch
, int_t lfetch0, int_t lfetch1, bool check_bound = false);
};
class gemm_tt : public gemm
class matrix_product_tt : public matrix_product
{
public:
gemm_tt(unsigned int simd, int_t ls0, int_t KL, int_t ls1, int_t D
matrix_product_tt(unsigned int simd, int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns, fetching_policy_type Afetch , fetching_policy_type Bfetch
, int_t lfetch0, int_t lfetch1, bool check_bound = false);
};
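The four transposition variants keep their constructor layout; only the gemm_* names become matrix_product_*. A sketch constructing one variant (every numeric argument below is a placeholder, not a tuned configuration):

    #include "isaac/kernels/templates/matrix_product.h"

    namespace isaac { namespace templates {
    // Hypothetical factory; argument order follows the declaration above:
    // (simd, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch, lfetch0, lfetch1)
    matrix_product_nt make_nt(fetching_policy_type Afetch, fetching_policy_type Bfetch)
    {
      return matrix_product_nt(1, 8, 8, 8, 1, 4, 4, 4, Afetch, Bfetch, 8, 8);  // was: gemm_nt(...)
    }
    } }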


@@ -8,27 +8,27 @@ namespace isaac
namespace templates
{
struct dot_parameters : public base::parameters_type
struct reduce_1d_parameters : public base::parameters_type
{
dot_parameters(unsigned int _simd_width,
reduce_1d_parameters(unsigned int _simd_width,
unsigned int _group_size, unsigned int _num_groups,
fetching_policy_type _fetching_policy);
unsigned int num_groups;
fetching_policy_type fetching_policy;
};
class dot : public base_impl<dot, dot_parameters>
class reduce_1d : public base_impl<reduce_1d, reduce_1d_parameters>
{
private:
unsigned int lmem_usage(math_expression const & expressions) const;
int is_invalid_impl(driver::Device const &, math_expression const &) const;
inline void reduce_1d_local_memory(kernel_generation_stream & stream, unsigned int size, std::vector<mapped_scalar_dot*> exprs,
inline void reduce_1d_local_memory(kernel_generation_stream & stream, unsigned int size, std::vector<mapped_reduce_1d*> exprs,
std::string const & buf_str, std::string const & buf_value_str, driver::backend_type backend) const;
std::string generate_impl(std::string const & suffix, math_expression const & expressions, driver::Device const & device, mapping_type const & mapping) const;
public:
dot(dot::parameters_type const & parameters, binding_policy_t binding_policy = BIND_INDEPENDENT);
dot(unsigned int simd, unsigned int ls, unsigned int ng, fetching_policy_type fetch, binding_policy_t bind = BIND_INDEPENDENT);
reduce_1d(reduce_1d::parameters_type const & parameters, binding_policy_t binding_policy = BIND_INDEPENDENT);
reduce_1d(unsigned int simd, unsigned int ls, unsigned int ng, fetching_policy_type fetch, binding_policy_t bind = BIND_INDEPENDENT);
std::vector<int_t> input_sizes(math_expression const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, base & fallback, execution_handler const &);
private:


@@ -10,9 +10,9 @@ namespace isaac
{
namespace templates
{
struct gemv_parameters : public base::parameters_type
struct reduce_2d_parameters : public base::parameters_type
{
gemv_parameters(unsigned int _simd_width,
reduce_2d_parameters(unsigned int _simd_width,
unsigned int _local_size_0, unsigned int _local_size_1,
unsigned int _num_groups_0, unsigned int _num_groups_1, fetching_policy_type _fetch_policy);
unsigned int num_groups_0;
@@ -21,15 +21,15 @@ struct gemv_parameters : public base::parameters_type
};
class gemv : public base_impl<gemv, gemv_parameters>
class reduce_2d : public base_impl<reduce_2d, reduce_2d_parameters>
{
protected:
enum dot_type
enum reduce_1d_type
{
REDUCE_ROWS,
REDUCE_COLUMNS
};
gemv(gemv::parameters_type const & , dot_type, binding_policy_t);
reduce_2d(reduce_2d::parameters_type const & , reduce_1d_type, binding_policy_t);
private:
virtual int is_invalid_impl(driver::Device const &, math_expression const &) const;
unsigned int lmem_usage(math_expression const &) const;
@@ -38,21 +38,21 @@ public:
virtual std::vector<int_t> input_sizes(math_expression const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, base & fallback, execution_handler const &);
private:
dot_type dot_type_;
reduce_1d_type reduce_1d_type_;
};
class gemv_n : public gemv
class reduce_2d_n : public reduce_2d
{
public:
gemv_n(gemv::parameters_type const &, binding_policy_t binding_policy = BIND_INDEPENDENT);
gemv_n(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, binding_policy_t bind = BIND_INDEPENDENT);
reduce_2d_n(reduce_2d::parameters_type const &, binding_policy_t binding_policy = BIND_INDEPENDENT);
reduce_2d_n(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, binding_policy_t bind = BIND_INDEPENDENT);
};
class gemv_t : public gemv
class reduce_2d_t : public reduce_2d
{
public:
gemv_t(gemv::parameters_type const &, binding_policy_t binding_policy = BIND_INDEPENDENT);
gemv_t(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, binding_policy_t bind = BIND_INDEPENDENT);
reduce_2d_t(reduce_2d::parameters_type const &, binding_policy_t binding_policy = BIND_INDEPENDENT);
reduce_2d_t(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, binding_policy_t bind = BIND_INDEPENDENT);
};
}
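The reduction templates follow the same pattern: dot becomes reduce_1d, and the gemv_n/gemv_t pair becomes reduce_2d_n/reduce_2d_t with unchanged constructor signatures. A hedged sketch (all sizes are illustrative placeholders):

    #include "isaac/kernels/templates/reduce_1d.h"
    #include "isaac/kernels/templates/reduce_2d.h"

    namespace isaac { namespace templates {
    // Hypothetical helpers illustrating the renamed constructors only.
    reduce_1d   make_reduce_1d(fetching_policy_type fetch)   { return reduce_1d(1, 128, 64, fetch); }        // was: dot
    reduce_2d_n make_reduce_2d_n(fetching_policy_type fetch) { return reduce_2d_n(1, 8, 16, 8, 8, fetch); }  // was: gemv_n
    reduce_2d_t make_reduce_2d_t(fetching_policy_type fetch) { return reduce_2d_t(1, 8, 16, 8, 8, fetch); }  // was: gemv_t
    } }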


@@ -13,7 +13,7 @@ namespace preset
{
class gemm
class matrix_product
{
public:


@@ -115,7 +115,7 @@ public:
The user buffer will not be deallocated when this allocator is destructed.
\param buffer User supplied buffer.
\param size Size of the buffer in bytes. It must at least larger than sizeof(ChunkHeader).
\param size Size of the buffer in bytes. It must at least larelementwise_2d than sizeof(ChunkHeader).
\param chunkSize The size of memory chunk. The default is kDefaultChunkSize.
\param baseAllocator The allocator for allocating memory chunks.
*/


@@ -128,7 +128,7 @@ public:
typedef typename BaseType::pointer Pointer;
//! Reference to (const) GenericMember
typedef typename BaseType::reference Reference;
//! Signed integer type (e.g. \c ptrdiff_t)
//! Signed inteelementwise_2d type (e.g. \c ptrdiff_t)
typedef typename BaseType::difference_type DifferenceType;
//! Default constructor (singular value)
@@ -265,7 +265,7 @@ struct GenericStringRef {
\tparam N length of the string, automatically inferred
\param str Constant character array, lifetime assumed to be longer
\param str Constant character array, lifetime assumed to be lonelementwise_2d
than the use of the string in e.g. a GenericValue
\post \ref s == str
@@ -289,7 +289,7 @@ struct GenericStringRef {
\see StringRef(const CharType*)
\param str Constant character pointer, lifetime assumed to be longer
\param str Constant character pointer, lifetime assumed to be lonelementwise_2d
than the use of the string in e.g. a GenericValue
\post \ref s == str
@@ -305,7 +305,7 @@ struct GenericStringRef {
: s(str), length(internal::StrLen(str)){ RAPIDJSON_ASSERT(s != NULL); }
//! Create constant string reference from pointer and length
/*! \param str constant string, lifetime assumed to be longer than the use of the string in e.g. a GenericValue
/*! \param str constant string, lifetime assumed to be lonelementwise_2d than the use of the string in e.g. a GenericValue
\param len length of the string, excluding the trailing NULL terminator
\post \ref s == str && \ref length == len
@@ -334,7 +334,7 @@ private:
value in a JSON GenericValue object, if the string's lifetime is known
to be valid long enough.
\tparam CharType Character type of the string
\param str Constant string, lifetime assumed to be longer than the use of the string in e.g. a GenericValue
\param str Constant string, lifetime assumed to be lonelementwise_2d than the use of the string in e.g. a GenericValue
\return GenericStringRef string reference object
\relatesalso GenericStringRef
@@ -355,7 +355,7 @@ inline GenericStringRef<CharType> StringRef(const CharType* str) {
supports string containing null characters.
\tparam CharType character type of the string
\param str Constant string, lifetime assumed to be longer than the use of the string in e.g. a GenericValue
\param str Constant string, lifetime assumed to be lonelementwise_2d than the use of the string in e.g. a GenericValue
\param length The length of source string.
\return GenericStringRef string reference object
\relatesalso GenericStringRef
@@ -373,7 +373,7 @@ inline GenericStringRef<CharType> StringRef(const CharType* str, size_t length)
to be valid long enough.
\tparam CharType character type of the string
\param str Constant string, lifetime assumed to be longer than the use of the string in e.g. a GenericValue
\param str Constant string, lifetime assumed to be lonelementwise_2d than the use of the string in e.g. a GenericValue
\return GenericStringRef string reference object
\relatesalso GenericStringRef
\note Requires the definition of the preprocessor symbol \ref RAPIDJSON_HAS_STDSTRING.
@@ -696,7 +696,7 @@ public:
case kNumberType:
if (IsDouble() || rhs.IsDouble())
return GetDouble() == rhs.GetDouble(); // May convert one operand from integer to double.
return GetDouble() == rhs.GetDouble(); // May convert one operand from inteelementwise_2d to double.
else
return data_.n.u64 == rhs.data_.n.u64;
@@ -1482,7 +1482,7 @@ private:
inline SizeType GetLength() const { return (SizeType)(MaxSize - str[LenPos]); }
}; // at most as many bytes as "String" above => 12 bytes in 32-bit mode, 16 bytes in 64-bit mode
// By using proper binary layout, retrieval of different integer types do not need conversions.
// By using proper binary layout, retrieval of different inteelementwise_2d types do not need conversions.
union Number {
#if RAPIDJSON_ENDIAN == RAPIDJSON_LITTLEENDIAN
struct I {


@@ -20,7 +20,7 @@
// This is a C++ header-only implementation of Grisu2 algorithm from the publication:
// Loitsch, Florian. "Printing floating-point numbers quickly and accurately with
// integers." ACM Sigplan Notices 45.6 (2010): 233-243.
// inteelementwise_2ds." ACM Sigplan Notices 45.6 (2010): 233-243.
#ifndef RAPIDJSON_DTOA_
#define RAPIDJSON_DTOA_


@@ -24,7 +24,7 @@
namespace rapidjson {
namespace internal {
//! Computes integer powers of 10 in double (10.0^n).
//! Computes inteelementwise_2d powers of 10 in double (10.0^n).
/*! This function uses lookup table for fast and accurate results.
\param n non-negative exponent. Must <= 308.
\return 10.0^n


@@ -53,9 +53,9 @@
/*! \def RAPIDJSON_NO_INT64DEFINE
\ingroup RAPIDJSON_CONFIG
\brief Use external 64-bit integer types.
\brief Use external 64-bit inteelementwise_2d types.
RapidJSON requires the 64-bit integer types \c int64_t and \c uint64_t types
RapidJSON requires the 64-bit inteelementwise_2d types \c int64_t and \c uint64_t types
to be available at global scope.
If users have their own definition, define RAPIDJSON_NO_INT64DEFINE to
@@ -171,11 +171,11 @@
///////////////////////////////////////////////////////////////////////////////
// RAPIDJSON_UINT64_C2
//! Construct a 64-bit literal by a pair of 32-bit integer.
//! Construct a 64-bit literal by a pair of 32-bit inteelementwise_2d.
/*!
64-bit literal with or without ULL suffix is prone to compiler warnings.
UINT64_C() is C macro which cause compilation problems.
Use this macro to define 64-bit constants by a pair of 32-bit integer.
Use this macro to define 64-bit constants by a pair of 32-bit inteelementwise_2d.
*/
#ifndef RAPIDJSON_UINT64_C2
#define RAPIDJSON_UINT64_C2(high32, low32) ((static_cast<uint64_t>(high32) << 32) | static_cast<uint64_t>(low32))


@@ -792,7 +792,7 @@ private:
}
}
// Force double for big integer
// Force double for big inteelementwise_2d
if (useDouble) {
while (s.Peek() >= '0' && s.Peek() <= '9') {
if (d >= 1.7976931348623157e307) // DBL_MAX / 10.0


@@ -117,23 +117,23 @@ std::string binary_leaf::evaluate_recursive(leaf_t leaf, std::map<std::string, s
}
mapped_gemm::mapped_gemm(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "gemm"), binary_leaf(info) { }
mapped_matrix_product::mapped_matrix_product(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_product"), binary_leaf(info) { }
//
mapped_dot::mapped_dot(std::string const & scalartype, unsigned int id, node_info info, std::string const & type_key) :
mapped_reduce::mapped_reduce(std::string const & scalartype, unsigned int id, node_info info, std::string const & type_key) :
mapped_object(scalartype, id, type_key), binary_leaf(info)
{ }
size_t mapped_dot::root_idx() const
size_t mapped_reduce::root_idx() const
{ return info_.root_idx; }
isaac::math_expression const & mapped_dot::math_expression() const
isaac::math_expression const & mapped_reduce::math_expression() const
{ return *info_.math_expression; }
math_expression::node mapped_dot::root_node() const
math_expression::node mapped_reduce::root_node() const
{ return math_expression().tree()[root_idx()]; }
bool mapped_dot::is_index_dot() const
bool mapped_reduce::is_index_reduction() const
{
op_element const & op = root_op();
return op.type==OPERATOR_ELEMENT_ARGFMAX_TYPE
@@ -142,17 +142,17 @@ bool mapped_dot::is_index_dot() const
|| op.type==OPERATOR_ELEMENT_ARGMIN_TYPE;
}
op_element mapped_dot::root_op() const
op_element mapped_reduce::root_op() const
{
return info_.math_expression->tree()[info_.root_idx].op;
}
//
mapped_scalar_dot::mapped_scalar_dot(std::string const & scalartype, unsigned int id, node_info info) : mapped_dot(scalartype, id, info, "scalar_dot"){ }
mapped_reduce_1d::mapped_reduce_1d(std::string const & scalartype, unsigned int id, node_info info) : mapped_reduce(scalartype, id, info, "scalar_reduce_1d"){ }
//
mapped_gemv::mapped_gemv(std::string const & scalartype, unsigned int id, node_info info) : mapped_dot(scalartype, id, info, "gemv") { }
mapped_reduce_2d::mapped_reduce_2d(std::string const & scalartype, unsigned int id, node_info info) : mapped_reduce(scalartype, id, info, "reduce_2d") { }
//
void mapped_host_scalar::preprocess(std::string & str) const


@@ -14,12 +14,12 @@ namespace detail
bool is_scalar_dot(math_expression::node const & node)
bool is_scalar_reduce_1d(math_expression::node const & node)
{
return node.op.type_family==OPERATOR_VECTOR_DOT_TYPE_FAMILY;
}
bool is_vector_dot(math_expression::node const & node)
bool is_vector_reduce_1d(math_expression::node const & node)
{
return node.op.type_family==OPERATOR_ROWS_DOT_TYPE_FAMILY
|| node.op.type_family==OPERATOR_COLUMNS_DOT_TYPE_FAMILY;


@@ -5,11 +5,11 @@
#include "isaac/array.h"
#include "isaac/tuple.h"
#include "isaac/kernels/keywords.h"
#include "isaac/kernels/templates/axpy.h"
#include "isaac/kernels/templates/dot.h"
#include "isaac/kernels/templates/ger.h"
#include "isaac/kernels/templates/gemv.h"
#include "isaac/kernels/templates/gemm.h"
#include "isaac/kernels/templates/elementwise_1d.h"
#include "isaac/kernels/templates/reduce_1d.h"
#include "isaac/kernels/templates/elementwise_2d.h"
#include "isaac/kernels/templates/reduce_2d.h"
#include "isaac/kernels/templates/matrix_product.h"
#include "isaac/kernels/templates/base.h"
#include "isaac/kernels/parse.h"
#include "isaac/exception/unknown_datatype.h"
@@ -150,11 +150,11 @@ int base_impl<TType, PType>::is_invalid(math_expression const & expressions, dr
return is_invalid_impl(device, expressions);
}
template class base_impl<axpy, axpy_parameters>;
template class base_impl<dot, dot_parameters>;
template class base_impl<ger, ger_parameters>;
template class base_impl<gemv, gemv_parameters>;
template class base_impl<gemm, gemm_parameters>;
template class base_impl<elementwise_1d, elementwise_1d_parameters>;
template class base_impl<reduce_1d, reduce_1d_parameters>;
template class base_impl<elementwise_2d, elementwise_2d_parameters>;
template class base_impl<reduce_2d, reduce_2d_parameters>;
template class base_impl<matrix_product, matrix_product_parameters>;
}
}


@@ -2,7 +2,7 @@
#include <cstring>
#include <algorithm>
#include "isaac/kernels/templates/axpy.h"
#include "isaac/kernels/templates/elementwise_1d.h"
#include "isaac/kernels/keywords.h"
#include "isaac/driver/backend.h"
@@ -18,7 +18,7 @@ namespace isaac
namespace templates
{
axpy_parameters::axpy_parameters(unsigned int _simd_width,
elementwise_1d_parameters::elementwise_1d_parameters(unsigned int _simd_width,
unsigned int _group_size, unsigned int _num_groups,
fetching_policy_type _fetching_policy) :
base::parameters_type(_simd_width, _group_size, 1, 1), num_groups(_num_groups), fetching_policy(_fetching_policy)
@@ -26,14 +26,14 @@ axpy_parameters::axpy_parameters(unsigned int _simd_width,
}
int axpy::is_invalid_impl(driver::Device const &, math_expression const &) const
int elementwise_1d::is_invalid_impl(driver::Device const &, math_expression const &) const
{
if (p_.fetching_policy==FETCH_FROM_LOCAL)
return TEMPLATE_INVALID_FETCHING_POLICY_TYPE;
return TEMPLATE_VALID;
}
std::string axpy::generate_impl(std::string const & suffix, math_expression const & expressions, driver::Device const & device, mapping_type const & mappings) const
std::string elementwise_1d::generate_impl(std::string const & suffix, math_expression const & expressions, driver::Device const & device, mapping_type const & mappings) const
{
driver::backend_type backend = device.backend();
std::string _size_t = size_type(device);
@@ -55,7 +55,7 @@ std::string axpy::generate_impl(std::string const & suffix, math_expression cons
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; break;
}
stream << KernelPrefix(backend) << " void " << "axpy" << suffix << "(" << _size_t << " N," << generate_arguments(dtype, device, mappings, expressions) << ")" << std::endl;
stream << KernelPrefix(backend) << " void " << "elementwise_1d" << suffix << "(" << _size_t << " N," << generate_arguments(dtype, device, mappings, expressions) << ")" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
@@ -174,23 +174,23 @@ std::string axpy::generate_impl(std::string const & suffix, math_expression cons
return stream.str();
}
axpy::axpy(axpy_parameters const & parameters,
elementwise_1d::elementwise_1d(elementwise_1d_parameters const & parameters,
binding_policy_t binding_policy) :
base_impl<axpy, axpy_parameters>(parameters, binding_policy)
base_impl<elementwise_1d, elementwise_1d_parameters>(parameters, binding_policy)
{}
axpy::axpy(unsigned int simd, unsigned int ls, unsigned int ng,
elementwise_1d::elementwise_1d(unsigned int simd, unsigned int ls, unsigned int ng,
fetching_policy_type fetch, binding_policy_t bind):
base_impl<axpy, axpy_parameters>(axpy_parameters(simd,ls,ng,fetch), bind)
base_impl<elementwise_1d, elementwise_1d_parameters>(elementwise_1d_parameters(simd,ls,ng,fetch), bind)
{}
std::vector<int_t> axpy::input_sizes(math_expression const & expressions) const
std::vector<int_t> elementwise_1d::input_sizes(math_expression const & expressions) const
{
return {expressions.shape().max()};
}
void axpy::enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, base & fallback, execution_handler const & control)
void elementwise_1d::enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, base & fallback, execution_handler const & control)
{
math_expression const & expressions = control.x();
//Size
@@ -202,7 +202,7 @@ void axpy::enqueue(driver::CommandQueue & queue, driver::Program const & program
return;
}
//Kernel
std::string name = "axpy";
std::string name = "elementwise_1d";
name += suffix;
driver::Kernel kernel(program, name.c_str());
//NDRange


@@ -1,6 +1,6 @@
#include <cstring>
#include <iostream>
#include "isaac/kernels/templates/ger.h"
#include "isaac/kernels/templates/elementwise_2d.h"
#include "isaac/symbolic/io.h"
#include "isaac/kernels/keywords.h"
@@ -13,14 +13,14 @@ namespace isaac
namespace templates
{
ger_parameters::ger_parameters(unsigned int _simd_width,
elementwise_2d_parameters::elementwise_2d_parameters(unsigned int _simd_width,
unsigned int _local_size_0, unsigned int _local_size_1,
unsigned int _num_groups_0, unsigned int _num_groups_1,
fetching_policy_type _fetching_policy) : base::parameters_type(_simd_width, _local_size_0, _local_size_1, 1), num_groups_0(_num_groups_0), num_groups_1(_num_groups_1), fetching_policy(_fetching_policy){ }
int ger::is_invalid_impl(driver::Device const &, math_expression const &) const
int elementwise_2d::is_invalid_impl(driver::Device const &, math_expression const &) const
{
if (p_.simd_width>1)
return TEMPLATE_INVALID_SIMD_WIDTH;
@@ -29,7 +29,7 @@ int ger::is_invalid_impl(driver::Device const &, math_expression const &) const
return TEMPLATE_VALID;
}
std::string ger::generate_impl(std::string const & suffix, math_expression const & expressions, driver::Device const & device, mapping_type const & mappings) const
std::string elementwise_2d::generate_impl(std::string const & suffix, math_expression const & expressions, driver::Device const & device, mapping_type const & mappings) const
{
kernel_generation_stream stream;
std::string _size_t = size_type(device);
@@ -45,7 +45,7 @@ std::string ger::generate_impl(std::string const & suffix, math_expression const
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; break;
}
stream << KernelPrefix(backend) << " void axpy" << suffix << "(" << _size_t << " M, " << _size_t << " N, " << generate_arguments("#scalartype", device, mappings, expressions) << ")" << std::endl;
stream << KernelPrefix(backend) << " void elementwise_1d" << suffix << "(" << _size_t << " M, " << _size_t << " N, " << generate_arguments("#scalartype", device, mappings, expressions) << ")" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
@@ -105,25 +105,25 @@ std::string ger::generate_impl(std::string const & suffix, math_expression const
return stream.str();
}
ger::ger(parameters_type const & parameters, binding_policy_t binding_policy) :
base_impl<ger, ger_parameters>(parameters, binding_policy){ }
elementwise_2d::elementwise_2d(parameters_type const & parameters, binding_policy_t binding_policy) :
base_impl<elementwise_2d, elementwise_2d_parameters>(parameters, binding_policy){ }
ger::ger(unsigned int simd, unsigned int ls1, unsigned int ls2,
elementwise_2d::elementwise_2d(unsigned int simd, unsigned int ls1, unsigned int ls2,
unsigned int ng1, unsigned int ng2, fetching_policy_type fetch,
binding_policy_t bind):
base_impl<ger, ger_parameters>(ger_parameters(simd, ls1, ls2, ng1, ng2, fetch), bind)
base_impl<elementwise_2d, elementwise_2d_parameters>(elementwise_2d_parameters(simd, ls1, ls2, ng1, ng2, fetch), bind)
{}
std::vector<int_t> ger::input_sizes(math_expression const & expression) const
std::vector<int_t> elementwise_2d::input_sizes(math_expression const & expression) const
{
std::pair<int_t, int_t> size = matrix_size(expression.tree(), lhs_most(expression.tree(), expression.root()));
return {size.first, size.second};
}
void ger::enqueue(driver::CommandQueue & /*queue*/, driver::Program const & program, std::string const & suffix, base &, execution_handler const & control)
void elementwise_2d::enqueue(driver::CommandQueue & /*queue*/, driver::Program const & program, std::string const & suffix, base &, execution_handler const & control)
{
math_expression const & expressions = control.x();
std::string name = "axpy";
std::string name = "elementwise_1d";
name +=suffix;
driver::Kernel kernel(program, name.c_str());
driver::NDRange global(p_.local_size_0*p_.num_groups_0, p_.local_size_1*p_.num_groups_1);


@@ -1,5 +1,5 @@
#include "isaac/array.h"
#include "isaac/kernels/templates/gemm.h"
#include "isaac/kernels/templates/matrix_product.h"
#include "isaac/kernels/keywords.h"
#include "isaac/symbolic/preset.h"
#include "isaac/exception/operation_not_supported.h"
@@ -15,7 +15,7 @@ namespace isaac
namespace templates
{
gemm_parameters::gemm_parameters(unsigned int simd_width
matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
, unsigned int local_size_0, unsigned int KL, unsigned int local_size_1, unsigned int D
, unsigned int ms, unsigned int ks, unsigned int ns
, fetching_policy_type A_fetching_policy, fetching_policy_type B_fetching_policy
@@ -27,7 +27,7 @@ gemm_parameters::gemm_parameters(unsigned int simd_width
}
unsigned int gemm::lmem_usage(math_expression const & expression) const
unsigned int matrix_product::lmem_usage(math_expression const & expression) const
{
numeric_type numeric_t = lhs_most(expression.tree(), expression.root()).lhs.dtype;
unsigned int N = 0;
@@ -36,7 +36,7 @@ unsigned int gemm::lmem_usage(math_expression const & expression) const
return N*size_of(numeric_t);
}
unsigned int gemm::registers_usage(math_expression const & expression) const
unsigned int matrix_product::registers_usage(math_expression const & expression) const
{
numeric_type numeric_t = lhs_most(expression.tree(), expression.root()).lhs.dtype;
@@ -44,7 +44,7 @@ unsigned int gemm::registers_usage(math_expression const & expression) const
return N*size_of(numeric_t);
}
unsigned int gemm::temporary_workspace(math_expression const & expressions) const
unsigned int matrix_product::temporary_workspace(math_expression const & expressions) const
{
std::vector<int_t> MNK = input_sizes(expressions);
int_t M = MNK[0]; int_t N = MNK[1];
@@ -53,7 +53,7 @@ unsigned int gemm::temporary_workspace(math_expression const & expressions) cons
return 0;
}
int gemm::is_invalid_impl(driver::Device const &, math_expression const &) const
int matrix_product::is_invalid_impl(driver::Device const &, math_expression const &) const
{
// if(device.vendor()==driver::Device::Vendor::NVIDIA && p_.simd_width > 1)
// return TEMPLATE_INVALID_SIMD_WIDTH;
@@ -103,7 +103,7 @@ int gemm::is_invalid_impl(driver::Device const &, math_expression const &) const
return TEMPLATE_VALID;
}
std::string gemm::generate_impl(std::string const & suffix, math_expression const & expression, driver::Device const & device, mapping_type const &) const
std::string matrix_product::generate_impl(std::string const & suffix, math_expression const & expression, driver::Device const & device, mapping_type const &) const
{
using std::string;
using tools::to_string;
@@ -132,10 +132,10 @@ std::string gemm::generate_impl(std::string const & suffix, math_expression cons
//////////////////
/// DECLARATIONS
/// //////////////
std::string gemm_name = "gemm";
std::string matrix_product_name = "matrix_product";
std::string reduce_name = "reduce";
gemm_name += suffix;
matrix_product_name += suffix;
reduce_name += suffix;
switch(backend)
@@ -146,7 +146,7 @@ std::string gemm::generate_impl(std::string const & suffix, math_expression cons
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; break;
}
stream << KernelPrefix(backend) << " void " << gemm_name << "(" << _size_t << " M, " << _size_t << " N, " << _size_t << " K, "
stream << KernelPrefix(backend) << " void " << matrix_product_name << "(" << _size_t << " M, " << _size_t << " N, " << _size_t << " K, "
<< Global(backend) << " " << sdtype << "* C, " << _size_t << " ldc," << _size_t << " offc," << _size_t << " Cstride1, "
<< sdtype << " alpha,"
<< Global(backend) << " " << sdtype << "* A, " << _size_t << " lda," << _size_t << " offa," << _size_t << " Astride1,"
@@ -572,7 +572,7 @@ std::string gemm::generate_impl(std::string const & suffix, math_expression cons
#undef VST0RE
}
void gemm::enqueue_block(driver::CommandQueue & /*queue*/, int_t M, int_t N, int_t K,
void matrix_product::enqueue_block(driver::CommandQueue & /*queue*/, int_t M, int_t N, int_t K,
array_base const & A, array_base const & B, array_base const & C,
value_scalar const & alpha, value_scalar const & beta,
driver::Program const & program, std::string const & suffix, execution_options_type const & options)
@@ -582,53 +582,53 @@ void gemm::enqueue_block(driver::CommandQueue & /*queue*/, int_t M, int_t N, int
if(M==0 || N==0 || K==0)
return;
std::string gemm_name = "gemm";
std::string matrix_product_name = "matrix_product";
std::string reduce_name = "reduce";
gemm_name += suffix;
matrix_product_name += suffix;
reduce_name += suffix;
driver::Kernel gemm(program, gemm_name.c_str());
driver::Kernel matrix_product(program, matrix_product_name.c_str());
driver::NDRange local(p_.local_size_0, p_.local_size_1, 1);
driver::NDRange global(align(align(M,p_.mS)/p_.mS, p_.local_size_0), align(align(N,p_.nS)/p_.nS, p_.local_size_1), p_.depth);
unsigned int current_arg = 0;
bind_independent binder;
set_arguments_functor helper(binder, current_arg, gemm);
set_arguments_functor helper(binder, current_arg, matrix_product);
driver::Buffer& workspace = driver::backend::workspaces::get(options.queue(C.context()));
gemm.setSizeArg(current_arg++, M);
gemm.setSizeArg(current_arg++, N);
gemm.setSizeArg(current_arg++, K);
matrix_product.setSizeArg(current_arg++, M);
matrix_product.setSizeArg(current_arg++, N);
matrix_product.setSizeArg(current_arg++, K);
if(p_.depth==1)
{
gemm.setArg(current_arg++,C.data());
gemm.setSizeArg(current_arg++, C.stride()[1]);
gemm.setSizeArg(current_arg++, C.start());
gemm.setSizeArg(current_arg++, C.stride()[0]);
matrix_product.setArg(current_arg++,C.data());
matrix_product.setSizeArg(current_arg++, C.stride()[1]);
matrix_product.setSizeArg(current_arg++, C.start());
matrix_product.setSizeArg(current_arg++, C.stride()[0]);
}
else
{
gemm.setArg(current_arg++, workspace);
gemm.setSizeArg(current_arg++, M);
gemm.setSizeArg(current_arg++, 0);
gemm.setSizeArg(current_arg++, 1);
matrix_product.setArg(current_arg++, workspace);
matrix_product.setSizeArg(current_arg++, M);
matrix_product.setSizeArg(current_arg++, 0);
matrix_product.setSizeArg(current_arg++, 1);
}
helper.set_arguments(alpha.dtype(), alpha.values());
gemm.setArg(current_arg++, A.data());
gemm.setSizeArg(current_arg++, A.stride()[1]);
gemm.setSizeArg(current_arg++, A.start());
gemm.setSizeArg(current_arg++, A.stride()[0]);
matrix_product.setArg(current_arg++, A.data());
matrix_product.setSizeArg(current_arg++, A.stride()[1]);
matrix_product.setSizeArg(current_arg++, A.start());
matrix_product.setSizeArg(current_arg++, A.stride()[0]);
gemm.setArg(current_arg++, B.data());
gemm.setSizeArg(current_arg++, B.stride()[1]);
gemm.setSizeArg(current_arg++, B.start());
gemm.setSizeArg(current_arg++, B.stride()[0]);
matrix_product.setArg(current_arg++, B.data());
matrix_product.setSizeArg(current_arg++, B.stride()[1]);
matrix_product.setSizeArg(current_arg++, B.start());
matrix_product.setSizeArg(current_arg++, B.stride()[0]);
helper.set_arguments(beta.dtype(), beta.values());
options.enqueue(program.context(), gemm, global, local);
options.enqueue(program.context(), matrix_product, global, local);
if(p_.depth > 1)
{
@@ -652,18 +652,18 @@ void gemm::enqueue_block(driver::CommandQueue & /*queue*/, int_t M, int_t N, int
}
std::vector<int_t> gemm::infos(math_expression const & expression, symbolic::preset::gemm::args& arguments) const
std::vector<int_t> matrix_product::infos(math_expression const & expression, symbolic::preset::matrix_product::args& arguments) const
{
math_expression::container_type const & array = expression.tree();
std::size_t root = expression.root();
arguments = symbolic::preset::gemm::check(array, root);
arguments = symbolic::preset::matrix_product::check(array, root);
int_t M = arguments.C->array->shape()[0];
int_t N = arguments.C->array->shape()[1];
int_t K = (A_trans_=='T')?arguments.A->array->shape()[0]:arguments.A->array->shape()[1];
return {M, N, K};
}
gemm::gemm(gemm_parameters const & parameters, bool check_bounds, char A_trans, char B_trans) : base_impl<gemm, gemm_parameters>(parameters, BIND_INDEPENDENT), A_trans_(A_trans), B_trans_(B_trans), check_bounds_(check_bounds)
matrix_product::matrix_product(matrix_product_parameters const & parameters, bool check_bounds, char A_trans, char B_trans) : base_impl<matrix_product, matrix_product_parameters>(parameters, BIND_INDEPENDENT), A_trans_(A_trans), B_trans_(B_trans), check_bounds_(check_bounds)
{
if(A_trans_=='N' && B_trans_=='N') type_ = GEMM_NN_TYPE;
else if(A_trans_=='T' && B_trans_=='N') type_ = GEMM_TN_TYPE;
@@ -672,21 +672,21 @@ gemm::gemm(gemm_parameters const & parameters, bool check_bounds, char A_trans,
else throw;
}
std::vector<int_t> gemm::input_sizes(math_expression const & expressions) const
std::vector<int_t> matrix_product::input_sizes(math_expression const & expressions) const
{
symbolic::preset::gemm::args dummy;
symbolic::preset::matrix_product::args dummy;
return infos((math_expression&)expressions, dummy);
}
void gemm::enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, base & fallback_base, execution_handler const & control)
void matrix_product::enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, base & fallback_base, execution_handler const & control)
{
using namespace tools;
gemm & fallback = (gemm&)fallback_base;
matrix_product & fallback = (matrix_product&)fallback_base;
math_expression const & expressions = control.x();
symbolic::preset::gemm::args args;
symbolic::preset::matrix_product::args args;
std::vector<int_t> MNK = infos(expressions, args);
int_t M = MNK[0];
@@ -720,40 +720,40 @@ void gemm::enqueue(driver::CommandQueue & queue, driver::Program const & program
}
//
gemm_nn::gemm_nn(unsigned int simd
matrix_product_nn::matrix_product_nn(unsigned int simd
, int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns
, fetching_policy_type Afetch , fetching_policy_type Bfetch
, int_t lfetch0, int_t lfetch1, bool check_bound) :
gemm(gemm_parameters(simd, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch, lfetch0, lfetch1), check_bound, 'N', 'N')
matrix_product(matrix_product_parameters(simd, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch, lfetch0, lfetch1), check_bound, 'N', 'N')
{
}
//
gemm_tn::gemm_tn(unsigned int simd
matrix_product_tn::matrix_product_tn(unsigned int simd
, int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns
, fetching_policy_type Afetch , fetching_policy_type Bfetch
, int_t lfetch0, int_t lfetch1, bool check_bound) :
gemm(gemm_parameters(simd, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch, lfetch0, lfetch1), check_bound, 'T', 'N')
matrix_product(matrix_product_parameters(simd, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch, lfetch0, lfetch1), check_bound, 'T', 'N')
{ }
//
gemm_nt::gemm_nt(unsigned int simd
matrix_product_nt::matrix_product_nt(unsigned int simd
, int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns
, fetching_policy_type Afetch , fetching_policy_type Bfetch
, int_t lfetch0, int_t lfetch1, bool check_bound) :
gemm(gemm_parameters(simd, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch, lfetch0, lfetch1), check_bound, 'N', 'T')
matrix_product(matrix_product_parameters(simd, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch, lfetch0, lfetch1), check_bound, 'N', 'T')
{ }
//
gemm_tt::gemm_tt(unsigned int simd
matrix_product_tt::matrix_product_tt(unsigned int simd
, int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns
, fetching_policy_type Afetch , fetching_policy_type Bfetch
, int_t lfetch0, int_t lfetch1, bool check_bound) :
gemm(gemm_parameters(simd, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch, lfetch0, lfetch1), check_bound, 'T', 'T')
matrix_product(matrix_product_parameters(simd, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch, lfetch0, lfetch1), check_bound, 'T', 'T')
{ }
}
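For reference, the renamed matrix_product_* constructors keep the exact argument order of the old gemm_* ones, so only the type name changes at call sites. A minimal, hypothetical usage sketch (parameter values copied from the presets further down; the variable name is illustrative and the isaac headers are assumed to be on the include path):

#include "isaac/kernels/templates/matrix_product.h"

// Hypothetical sketch, not part of this commit: instantiate the NN variant.
// Argument order: simd, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch,
// lfetch0, lfetch1, check_bound.
isaac::templates::matrix_product_nn mp(
    1, 8, 16, 8, 1, 8, 1, 8,
    isaac::templates::FETCH_FROM_LOCAL, isaac::templates::FETCH_FROM_LOCAL,
    8, 8, true);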
View File
@@ -1,6 +1,6 @@
#include <cstring>
#include <iostream>
#include "isaac/kernels/templates/dot.h"
#include "isaac/kernels/templates/reduce_1d.h"
#include "isaac/kernels/keywords.h"
#include "tools/loop.hpp"
@@ -15,25 +15,25 @@ namespace isaac
{
namespace templates
{
dot_parameters::dot_parameters(unsigned int _simd_width,
reduce_1d_parameters::reduce_1d_parameters(unsigned int _simd_width,
unsigned int _group_size, unsigned int _num_groups,
fetching_policy_type _fetching_policy) : base::parameters_type(_simd_width, _group_size, 1, 2), num_groups(_num_groups), fetching_policy(_fetching_policy)
{ }
unsigned int dot::lmem_usage(math_expression const & x) const
unsigned int reduce_1d::lmem_usage(math_expression const & x) const
{
numeric_type numeric_t= lhs_most(x.tree(), x.root()).lhs.dtype;
return p_.local_size_0*size_of(numeric_t);
}
int dot::is_invalid_impl(driver::Device const &, math_expression const &) const
int reduce_1d::is_invalid_impl(driver::Device const &, math_expression const &) const
{
if (p_.fetching_policy==FETCH_FROM_LOCAL)
return TEMPLATE_INVALID_FETCHING_POLICY_TYPE;
return TEMPLATE_VALID;
}
inline void dot::reduce_1d_local_memory(kernel_generation_stream & stream, unsigned int size, std::vector<mapped_scalar_dot*> exprs,
inline void reduce_1d::reduce_1d_local_memory(kernel_generation_stream & stream, unsigned int size, std::vector<mapped_reduce_1d*> exprs,
std::string const & buf_str, std::string const & buf_value_str, driver::backend_type backend) const
{
stream << "#pragma unroll" << std::endl;
@@ -46,25 +46,25 @@ inline void dot::reduce_1d_local_memory(kernel_generation_stream & stream, unsig
stream.inc_tab();
for (auto & expr : exprs)
if (expr->is_index_dot())
compute_index_dot(stream, expr->process(buf_str+"[lid]"), expr->process(buf_str+"[lid+stride]")
if (expr->is_index_reduction())
compute_index_reduce_1d(stream, expr->process(buf_str+"[lid]"), expr->process(buf_str+"[lid+stride]")
, expr->process(buf_value_str+"[lid]"), expr->process(buf_value_str+"[lid+stride]"),
expr->root_op());
else
compute_dot(stream, expr->process(buf_str+"[lid]"), expr->process(buf_str+"[lid+stride]"), expr->root_op());
compute_reduce_1d(stream, expr->process(buf_str+"[lid]"), expr->process(buf_str+"[lid+stride]"), expr->root_op());
stream.dec_tab();
stream << "}" << std::endl;
stream.dec_tab();
stream << "}" << std::endl;
}
std::string dot::generate_impl(std::string const & suffix, math_expression const & expressions, driver::Device const & device, mapping_type const & mapping) const
std::string reduce_1d::generate_impl(std::string const & suffix, math_expression const & expressions, driver::Device const & device, mapping_type const & mapping) const
{
kernel_generation_stream stream;
std::vector<mapped_scalar_dot*> exprs;
std::vector<mapped_reduce_1d*> exprs;
for (mapping_type::const_iterator iit = mapping.begin(); iit != mapping.end(); ++iit)
if (mapped_scalar_dot * p = dynamic_cast<mapped_scalar_dot*>(iit->second.get()))
if (mapped_reduce_1d * p = dynamic_cast<mapped_reduce_1d*>(iit->second.get()))
exprs.push_back(p);
std::size_t N = exprs.size();
driver::backend_type backend = device.backend();
@@ -81,7 +81,7 @@ std::string dot::generate_impl(std::string const & suffix, math_expression const
{
numeric_type dtype = lhs_most(exprs[k]->math_expression().tree(), exprs[k]->math_expression().root()).lhs.dtype;
std::string sdtype = to_string(dtype);
if (exprs[k]->is_index_dot())
if (exprs[k]->is_index_reduction())
{
stream << exprs[k]->process("uint* #name_temp = (uint*)(tmp + " + tools::to_string(offset) + ");");
offset += 4*p_.num_groups;
@@ -125,7 +125,7 @@ std::string dot::generate_impl(std::string const & suffix, math_expression const
for (unsigned int k = 0; k < N; ++k)
{
if (exprs[k]->is_index_dot())
if (exprs[k]->is_index_reduction())
{
stream << exprs[k]->process(Local(backend).get() + " #scalartype #name_buf_value[" + tools::to_string(p_.local_size_0) + "];") << std::endl;
stream << exprs[k]->process("#scalartype #name_acc_value = " + neutral_element(exprs[k]->root_op(), backend, "#scalartype") + ";") << std::endl;
@@ -174,11 +174,11 @@ std::string dot::generate_impl(std::string const & suffix, math_expression const
accessors["matrix_diag"] = str[a];
accessors["array1"] = "#namereg";
std::string value = elem->evaluate_recursive(LHS_NODE_TYPE, accessors);
if (elem->is_index_dot())
compute_index_dot(stream, elem->process("#name_acc"), "i*" + tools::to_string(simd_width) + "+"
if (elem->is_index_reduction())
compute_index_reduce_1d(stream, elem->process("#name_acc"), "i*" + tools::to_string(simd_width) + "+"
+ tools::to_string(a), elem->process("#name_acc_value"), value,elem->root_op());
else
compute_dot(stream, elem->process("#name_acc"), value,elem->root_op());
compute_reduce_1d(stream, elem->process("#name_acc"), value,elem->root_op());
}
}
});
@@ -186,7 +186,7 @@ std::string dot::generate_impl(std::string const & suffix, math_expression const
//Fills local memory
for (unsigned int k = 0; k < N; ++k)
{
if (exprs[k]->is_index_dot())
if (exprs[k]->is_index_reduction())
stream << exprs[k]->process("#name_buf_value[lid] = #name_acc_value;") << std::endl;
stream << exprs[k]->process("#name_buf[lid] = #name_acc;") << std::endl;
}
@@ -200,7 +200,7 @@ std::string dot::generate_impl(std::string const & suffix, math_expression const
stream.inc_tab();
for (unsigned int k = 0; k < N; ++k)
{
if (exprs[k]->is_index_dot())
if (exprs[k]->is_index_reduction())
stream << exprs[k]->process("#name_temp_value[gpid] = #name_buf_value[0];") << std::endl;
stream << exprs[k]->process("#name_temp[gpid] = #name_buf[0];") << std::endl;
}
@@ -225,9 +225,9 @@ std::string dot::generate_impl(std::string const & suffix, math_expression const
stream << "unsigned int lid = " <<LocalIdx0(backend) << ";" << std::endl;
stream << "unsigned int lsize = " <<LocalSize0(backend) << ";" << std::endl;
for (mapped_scalar_dot* e: exprs)
for (mapped_reduce_1d* e: exprs)
{
if (e->is_index_dot())
if (e->is_index_reduction())
{
stream << e->process(Local(backend).get() + " unsigned int #name_buf[" + tools::to_string(p_.local_size_0) + "];");
stream << e->process("unsigned int #name_acc = 0;") << std::endl;
@@ -244,18 +244,18 @@ std::string dot::generate_impl(std::string const & suffix, math_expression const
stream << "for(unsigned int i = lid; i < " << p_.num_groups << "; i += lsize)" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
for (mapped_scalar_dot* e: exprs)
if (e->is_index_dot())
compute_index_dot(stream, e->process("#name_acc"), e->process("#name_temp[i]"), e->process("#name_acc_value"),e->process("#name_temp_value[i]"),e->root_op());
for (mapped_reduce_1d* e: exprs)
if (e->is_index_reduction())
compute_index_reduce_1d(stream, e->process("#name_acc"), e->process("#name_temp[i]"), e->process("#name_acc_value"),e->process("#name_temp_value[i]"),e->root_op());
else
compute_dot(stream, e->process("#name_acc"), e->process("#name_temp[i]"), e->root_op());
compute_reduce_1d(stream, e->process("#name_acc"), e->process("#name_temp[i]"), e->root_op());
stream.dec_tab();
stream << "}" << std::endl;
for (unsigned int k = 0; k < N; ++k)
{
if (exprs[k]->is_index_dot())
if (exprs[k]->is_index_reduction())
stream << exprs[k]->process("#name_buf_value[lid] = #name_acc_value;") << std::endl;
stream << exprs[k]->process("#name_buf[lid] = #name_acc;") << std::endl;
}
@@ -268,7 +268,7 @@ std::string dot::generate_impl(std::string const & suffix, math_expression const
stream << "{" << std::endl;
stream.inc_tab();
std::map<std::string, std::string> accessors;
accessors["scalar_dot"] = "#name_buf[0]";
accessors["scalar_reduce_1d"] = "#name_buf[0]";
accessors["array1"] = "#pointer[#start]";
accessors["array11"] = "#pointer[#start]";
stream << evaluate(PARENT_NODE_TYPE, accessors, expressions, expressions.root(), mapping) << ";" << std::endl;
@@ -283,23 +283,23 @@ std::string dot::generate_impl(std::string const & suffix, math_expression const
return stream.str();
}
dot::dot(dot::parameters_type const & parameters,
binding_policy_t binding) : base_impl<dot, dot_parameters>(parameters, binding)
reduce_1d::reduce_1d(reduce_1d::parameters_type const & parameters,
binding_policy_t binding) : base_impl<reduce_1d, reduce_1d_parameters>(parameters, binding)
{ }
dot::dot(unsigned int simd, unsigned int ls, unsigned int ng,
reduce_1d::reduce_1d(unsigned int simd, unsigned int ls, unsigned int ng,
fetching_policy_type fetch, binding_policy_t bind):
base_impl<dot, dot_parameters>(dot_parameters(simd,ls,ng,fetch), bind)
base_impl<reduce_1d, reduce_1d_parameters>(reduce_1d_parameters(simd,ls,ng,fetch), bind)
{}
std::vector<int_t> dot::input_sizes(math_expression const & x) const
std::vector<int_t> reduce_1d::input_sizes(math_expression const & x) const
{
std::vector<size_t> dots_idx = filter_nodes(&is_dot, x, x.root(), false);
int_t N = vector_size(lhs_most(x.tree(), dots_idx[0]));
std::vector<size_t> reduce_1ds_idx = filter_nodes(&is_reduce_1d, x, x.root(), false);
int_t N = vector_size(lhs_most(x.tree(), reduce_1ds_idx[0]));
return {N};
}
void dot::enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, base & fallback, execution_handler const & control)
void reduce_1d::enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, base & fallback, execution_handler const & control)
{
math_expression const & x = control.x();
@@ -313,10 +313,10 @@ void dot::enqueue(driver::CommandQueue & queue, driver::Program const & program,
return;
}
std::vector<math_expression::node const *> dots;
std::vector<size_t> dots_idx = filter_nodes(&is_dot, x, x.root(), false);
for (size_t idx: dots_idx)
dots.push_back(&x.tree()[idx]);
std::vector<math_expression::node const *> reduce_1ds;
std::vector<size_t> reduce_1ds_idx = filter_nodes(&is_reduce_1d, x, x.root(), false);
for (size_t idx: reduce_1ds_idx)
reduce_1ds.push_back(&x.tree()[idx]);
//Kernel
std::string name[2] = {"prod", "reduce"};
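The reduce_1d template keeps dot's constructor shape. A hypothetical construction sketch using the default preset values that appear further down (variable name illustrative; assumes the isaac headers are available):

#include "isaac/kernels/templates/reduce_1d.h"

// Hypothetical sketch: simd width 1, 64 work-items per group, 128 groups,
// strided global fetches -- the same values as the presets further down.
isaac::templates::reduce_1d red(1, 64, 128,
                                isaac::templates::FETCH_FROM_GLOBAL_STRIDED);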
View File
@@ -2,7 +2,7 @@
#include <iostream>
#include "isaac/kernels/stream.h"
#include "isaac/kernels/keywords.h"
#include "isaac/kernels/templates/gemv.h"
#include "isaac/kernels/templates/reduce_2d.h"
#include "tools/arguments.hpp"
#include "tools/loop.hpp"
@@ -16,33 +16,33 @@ namespace isaac
namespace templates
{
gemv_parameters::gemv_parameters(unsigned int _simd_width,
reduce_2d_parameters::reduce_2d_parameters(unsigned int _simd_width,
unsigned int _local_size_0, unsigned int _local_size_1,
unsigned int _num_groups_0, unsigned int _num_groups_1, fetching_policy_type _fetch_policy): base::parameters_type(_simd_width, _local_size_0, _local_size_1, 1),
num_groups_0(_num_groups_0), num_groups_1(_num_groups_1), fetch_policy(_fetch_policy) { }
int gemv::is_invalid_impl(driver::Device const &, math_expression const &) const
int reduce_2d::is_invalid_impl(driver::Device const &, math_expression const &) const
{
if (p_.fetch_policy==FETCH_FROM_LOCAL)
return TEMPLATE_INVALID_FETCHING_POLICY_TYPE;
return TEMPLATE_VALID;
}
unsigned int gemv::lmem_usage(const math_expression&) const
unsigned int reduce_2d::lmem_usage(const math_expression&) const
{
return (p_.local_size_0+1)*p_.local_size_1;
}
std::string gemv::generate_impl(std::string const & suffix, math_expression const & expression, driver::Device const & device, mapping_type const & mapping) const
std::string reduce_2d::generate_impl(std::string const & suffix, math_expression const & expression, driver::Device const & device, mapping_type const & mapping) const
{
using tools::to_string;
std::vector<mapped_gemv*> dots;
std::vector<size_t> idx = filter_nodes(&is_dot, expression, expression.root(), false);
std::vector<mapped_reduce_2d*> reduce_1ds;
std::vector<size_t> idx = filter_nodes(&is_reduce_1d, expression, expression.root(), false);
for (auto & elem : idx)
dots.push_back((mapped_gemv*)(mapping.at(mapping_key(elem, PARENT_NODE_TYPE)).get()));
reduce_1ds.push_back((mapped_reduce_2d*)(mapping.at(mapping_key(elem, PARENT_NODE_TYPE)).get()));
kernel_generation_stream stream;
driver::backend_type backend = device.backend();
@@ -55,11 +55,11 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
auto unroll_tmp = [&]()
{
unsigned int offset = 0;
for (const auto & e : dots)
for (const auto & e : reduce_1ds)
{
numeric_type dtype = lhs_most(e->math_expression().tree(), e->math_expression().root()).lhs.dtype;
std::string sdtype = to_string(dtype);
if (e->is_index_dot())
if (e->is_index_reduction())
{
stream << e->process("uint* #name_temp = (uint*)(tmp + " + tools::to_string(offset) + "*M);");
offset += 4*p_.num_groups_0;
@@ -73,7 +73,7 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
}
};
int col_simd_width = (dot_type_ == REDUCE_COLUMNS) ? 1 : p_.simd_width;
int col_simd_width = (reduce_1d_type_ == REDUCE_COLUMNS) ? 1 : p_.simd_width;
switch(backend)
{
case driver::CUDA:
@@ -96,7 +96,7 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
unsigned int local_size_0_ld = p_.local_size_0;
std::string local_size_0_ld_str = to_string(local_size_0_ld);
for (const auto & e : dots)
for (const auto & e : reduce_1ds)
stream << e->process(Local(backend).get() + " " + append_width("#scalartype", col_simd_width) + " #name_buf[" + to_string(p_.local_size_1*local_size_0_ld) + "];") << std::endl;
stream << "for(" << _size_t << " r = " << GlobalIdx1(backend) << "*" << col_simd_width << "; r < (M +" << p_.local_size_1 - 1 << ")/" << p_.local_size_1 << "*" << p_.local_size_1*col_simd_width << "; r += " << GlobalSize1(backend) << "*" << col_simd_width << ")" << std::endl;
@@ -106,7 +106,7 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
stream << "" << _size_t << " lidx = " << LocalIdx0(backend) << ";" << std::endl;
stream << "" << _size_t << " lidy = " << LocalIdx1(backend) <<";" << std::endl;
for (const auto & e : dots){
for (const auto & e : reduce_1ds){
std::string data_type = append_width("#scalartype",col_simd_width);
stream << e->process(data_type + " #name_acc = " + InitPrefix(backend, data_type).get() + "(" + neutral_element((e)->root_op(), backend, "#scalartype") + ");") << std::endl;
@@ -116,14 +116,14 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
stream << "{" << std::endl;
stream.inc_tab();
element_wise_loop_1D(stream, p_.fetch_policy, (dot_type_==REDUCE_COLUMNS)?p_.simd_width:1, "c", "N", GlobalIdx0(backend).get(), GlobalSize0(backend).get(), device, [&](unsigned int row_simd_width)
element_wise_loop_1D(stream, p_.fetch_policy, (reduce_1d_type_==REDUCE_COLUMNS)?p_.simd_width:1, "c", "N", GlobalIdx0(backend).get(), GlobalSize0(backend).get(), device, [&](unsigned int row_simd_width)
{
std::set<std::string> already_fetched;
for (const auto & e : dots)
for (const auto & e : reduce_1ds)
{
std::map<std::string, std::string> accessors;
if(dot_type_==REDUCE_COLUMNS)
if(reduce_1d_type_==REDUCE_COLUMNS)
{
std::string data_type = append_width("#scalartype",row_simd_width);
accessors["arraynn"] = data_type + " #namereg = " + vload(row_simd_width, "#scalartype", "c*#stride", "#pointer + r*#ld", "1", backend,false)+";";
@@ -147,20 +147,20 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
str[a] = access_vector_type("#namereg",a);
for (auto & elem : dots)
for (auto & elem : reduce_1ds)
for (unsigned int a = 0; a < row_simd_width; ++a)
{
std::string value = elem->evaluate_recursive(LHS_NODE_TYPE, {{"arraynn", str[a]}, {"repeat", str[a]}, {"array1", "#namereg"}});
if (elem->is_index_dot())
compute_index_dot(stream, elem->process("#name_acc"), "c*"+to_string(row_simd_width) + to_string(a), elem->process("#name_acc_value"), value, elem->root_op());
if (elem->is_index_reduction())
compute_index_reduce_1d(stream, elem->process("#name_acc"), "c*"+to_string(row_simd_width) + to_string(a), elem->process("#name_acc_value"), value, elem->root_op());
else
compute_dot(stream, elem->process("#name_acc"), value,elem->root_op());
compute_reduce_1d(stream, elem->process("#name_acc"), value,elem->root_op());
}
});
stream.dec_tab();
stream << "}" << std::endl;
for (auto & expr : dots)
for (auto & expr : reduce_1ds)
stream << expr->process("#name_buf[lidy*" + local_size_0_ld_str + "+ lidx] = #name_acc;") << std::endl;
stream << "#pragma unroll" << std::endl;
@@ -173,13 +173,13 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
stream << "{" << std::endl;
stream.inc_tab();
for (auto & e : dots)
if (e->is_index_dot())
compute_index_dot(stream, e->process("#name_buf[lidy*" + local_size_0_ld_str + " + lidx]"), e->process("#name_buf[lidy*" + local_size_0_ld_str + " + lidx + stride]")
for (auto & e : reduce_1ds)
if (e->is_index_reduction())
compute_index_reduce_1d(stream, e->process("#name_buf[lidy*" + local_size_0_ld_str + " + lidx]"), e->process("#name_buf[lidy*" + local_size_0_ld_str + " + lidx + stride]")
, e->process("#name_buf_value[lidy*" + local_size_0_ld_str + " + lidx]"), e->process("#name_buf_value[lidy*" + local_size_0_ld_str + " + lidx + stride]")
, e->root_op());
else
compute_dot(stream,e->process("#name_buf[lidy*" + local_size_0_ld_str + " + lidx]"), e->process("#name_buf[lidy*" + local_size_0_ld_str + " + lidx + stride]"), e->root_op());
compute_reduce_1d(stream,e->process("#name_buf[lidy*" + local_size_0_ld_str + " + lidx]"), e->process("#name_buf[lidy*" + local_size_0_ld_str + " + lidx + stride]"), e->root_op());
stream.dec_tab();
stream << "}" << std::endl;
@@ -196,9 +196,9 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
std::map<std::string, std::string> accessors;
for(int s = 0 ; s < col_simd_width ; ++s)
{
accessors["gemv"] = "#name_buf[lidy*" + local_size_0_ld_str + "]";
accessors["reduce_2d"] = "#name_buf[lidy*" + local_size_0_ld_str + "]";
if(col_simd_width > 1)
accessors["gemv"] = access_vector_type(accessors["gemv"], s);
accessors["reduce_2d"] = access_vector_type(accessors["reduce_2d"], s);
accessors["arrayn"] = "#pointer[(r +" + to_string(s) + ")*#stride]";
accessors["array1n"] = "#pointer[(r +" + to_string(s) + ")*#stride]";
accessors["arrayn1"] = "#pointer[(r +" + to_string(s) + ")*#stride]";
@@ -207,11 +207,11 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
}
else
{
for (mapped_dot const * e : dots)
for (mapped_reduce const * e : reduce_1ds)
{
if(col_simd_width > 1)
stream << "if(M - r > " << col_simd_width << "){" << std::endl;
if (e->is_index_dot())
if (e->is_index_reduction())
stream << e->process(vstore(col_simd_width,"uint", "#name_buf_value[lidy*" + local_size_0_ld_str + "]", "0", "#name_temp_value + r + M*" + GroupIdx0(backend).get(), "1", backend, false)) << ";" << std::endl;
stream << e->process(vstore(col_simd_width,"#scalartype", "#name_buf[lidy*" + local_size_0_ld_str + "]", "0", "#name_temp + r + M*" + GroupIdx0(backend).get(), "1", backend, false)) << ";" << std::endl;
if(col_simd_width > 1)
@@ -220,7 +220,7 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
stream << "else{" << std::endl;
stream.inc_tab();
for(int s = 0 ; s < col_simd_width ; ++s){
if (e->is_index_dot())
if (e->is_index_reduction())
stream << "if(r + " << s << "< M) " << e->process("#name_temp_value[r + " + to_string(s) + " + M*" + GroupIdx0(backend).get() + "] = " + access_vector_type("#name_buf_value[lidy*" + local_size_0_ld_str + "]", s)) << ";" << std::endl;
stream << "if(r + " << s << "< M) " << e->process("#name_temp[r + " + to_string(s) + " + M*" + GroupIdx0(backend).get() + "] = " + access_vector_type("#name_buf[lidy*" + local_size_0_ld_str + "]", s)) << ";" << std::endl;
}
@@ -262,7 +262,7 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
{"arrayn1", "#pointer += #start;"},
{"arraynn", "#pointer += #start; "}}, expression, mapping);
for (const auto & e : dots)
for (const auto & e : reduce_1ds)
stream << e->process(Local(backend).get() + " #scalartype #name_buf[" + to_string(p_.local_size_1*local_size_0_ld) + "];") << std::endl;
stream << "for(" << _size_t << " r = " << GlobalIdx1(backend) << "; r < (M +" << p_.local_size_1 - 1 << ")/" << p_.local_size_1 << "*" << p_.local_size_1 << "; r += " << GlobalSize1(backend) << "){" << std::endl;
@@ -270,7 +270,7 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
stream << _size_t << " lidx = " << LocalIdx0(backend) << ";" << std::endl;
stream << _size_t << " lidy = " << LocalIdx1(backend) <<";" << std::endl;
for (const auto & e : dots)
for (const auto & e : reduce_1ds)
stream << e->process("#scalartype #name_acc = " + neutral_element((e)->root_op(), backend, "#scalartype") + ";") << std::endl;
stream << "if (r < M)" << std::endl;
@@ -280,8 +280,8 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
stream << "for(" << _size_t << " c = lidx; c < " << p_.num_groups_0 << "; c += " << LocalSize0(backend) << "){" << std::endl;
stream.inc_tab();
for (mapped_dot* e: dots)
compute_dot(stream, e->process("#name_acc"), e->process("#name_temp[r + M*c]"), e->root_op());
for (mapped_reduce* e: reduce_1ds)
compute_reduce_1d(stream, e->process("#name_acc"), e->process("#name_temp[r + M*c]"), e->root_op());
stream.dec_tab();
stream << "}" << std::endl;
@@ -290,7 +290,7 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
stream.dec_tab();
stream << "}" << std::endl;
for (auto & expr : dots)
for (auto & expr : reduce_1ds)
stream << expr->process("#name_buf[lidy*" + local_size_0_ld_str + "+ lidx] = #name_acc;") << std::endl;
stream << "#pragma unroll" << std::endl;
@@ -303,13 +303,13 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
stream << "{" << std::endl;
stream.inc_tab();
for (auto & e : dots)
if (e->is_index_dot())
compute_index_dot(stream, e->process("#name_buf[lidy*" + local_size_0_ld_str + " + lidx]"), e->process("#name_buf[lidy*" + local_size_0_ld_str + " + lidx + stride]")
for (auto & e : reduce_1ds)
if (e->is_index_reduction())
compute_index_reduce_1d(stream, e->process("#name_buf[lidy*" + local_size_0_ld_str + " + lidx]"), e->process("#name_buf[lidy*" + local_size_0_ld_str + " + lidx + stride]")
, e->process("#name_buf_value[lidy*" + local_size_0_ld_str + " + lidx]"), e->process("#name_buf_value[lidy*" + local_size_0_ld_str + " + lidx + stride]")
, e->root_op());
else
compute_dot(stream,e->process("#name_buf[lidy*" + local_size_0_ld_str + " + lidx]"), e->process("#name_buf[lidy*" + local_size_0_ld_str + " + lidx + stride]"), e->root_op());
compute_reduce_1d(stream,e->process("#name_buf[lidy*" + local_size_0_ld_str + " + lidx]"), e->process("#name_buf[lidy*" + local_size_0_ld_str + " + lidx + stride]"), e->root_op());
stream.dec_tab();
stream << "}" << std::endl;
@@ -323,7 +323,7 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
stream.inc_tab();
std::map<std::string, std::string> accessors;
accessors["gemv"] = "#name_buf[lidy*" + local_size_0_ld_str + "]";
accessors["reduce_2d"] = "#name_buf[lidy*" + local_size_0_ld_str + "]";
accessors["arrayn"] = "#pointer[r*#stride]";
accessors["array1n"] = "#pointer[r*#stride]";
accessors["arrayn1"] = "#pointer[r*#stride]";
@@ -344,30 +344,30 @@ std::string gemv::generate_impl(std::string const & suffix, math_expression cons
return stream.str();
}
gemv::gemv(gemv::parameters_type const & parameters,
gemv::dot_type rtype,
reduce_2d::reduce_2d(reduce_2d::parameters_type const & parameters,
reduce_2d::reduce_1d_type rtype,
binding_policy_t binding_policy) :
base_impl<gemv, gemv_parameters>(parameters, binding_policy),
dot_type_(rtype){ }
base_impl<reduce_2d, reduce_2d_parameters>(parameters, binding_policy),
reduce_1d_type_(rtype){ }
std::vector<int_t> gemv::input_sizes(math_expression const & expression) const
std::vector<int_t> reduce_2d::input_sizes(math_expression const & expression) const
{
std::vector<std::size_t> idx = filter_nodes(&is_dot, expression, expression.root(), false);
std::vector<std::size_t> idx = filter_nodes(&is_reduce_1d, expression, expression.root(), false);
std::pair<int_t, int_t> MN = matrix_size(expression.tree(), lhs_most(expression.tree(), idx[0]));
if(dot_type_==REDUCE_COLUMNS)
if(reduce_1d_type_==REDUCE_COLUMNS)
std::swap(MN.first,MN.second);
return {MN.first, MN.second};
}
void gemv::enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, base & fallback, execution_handler const & control)
void reduce_2d::enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, base & fallback, execution_handler const & control)
{
math_expression const & expression = control.x();
std::vector<int_t> MN = input_sizes(expression);
std::vector<math_expression::node const *> dots;
std::vector<size_t> dots_idx = filter_nodes(&is_dot, expression, expression.root(), false);
for (size_t idx : dots_idx)
dots.push_back(&expression.tree()[idx]);
std::vector<math_expression::node const *> reduce_1ds;
std::vector<size_t> reduce_1ds_idx = filter_nodes(&is_reduce_1d, expression, expression.root(), false);
for (size_t idx : reduce_1ds_idx)
reduce_1ds.push_back(&expression.tree()[idx]);
//Fallback
if(p_.simd_width>1 && requires_fallback(expression))
@@ -406,15 +406,15 @@ void gemv::enqueue(driver::CommandQueue & queue, driver::Program const & program
control.execution_options().enqueue(program.context(), kernels[i], global[i], local[i]);
}
gemv_n::gemv_n(gemv_parameters const & parameters,binding_policy_t binding_policy): gemv(parameters, REDUCE_ROWS, binding_policy){}
reduce_2d_n::reduce_2d_n(reduce_2d_parameters const & parameters,binding_policy_t binding_policy): reduce_2d(parameters, REDUCE_ROWS, binding_policy){}
gemv_n::gemv_n(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2,
fetching_policy_type fetch, binding_policy_t bind): gemv(gemv_parameters(simd, ls1, ls2, ng1, ng2, fetch), REDUCE_ROWS, bind) {}
reduce_2d_n::reduce_2d_n(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2,
fetching_policy_type fetch, binding_policy_t bind): reduce_2d(reduce_2d_parameters(simd, ls1, ls2, ng1, ng2, fetch), REDUCE_ROWS, bind) {}
gemv_t::gemv_t(gemv::parameters_type const & parameters, binding_policy_t binding_policy): gemv(parameters, REDUCE_COLUMNS, binding_policy){}
reduce_2d_t::reduce_2d_t(reduce_2d::parameters_type const & parameters, binding_policy_t binding_policy): reduce_2d(parameters, REDUCE_COLUMNS, binding_policy){}
gemv_t::gemv_t(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2,
fetching_policy_type fetch, binding_policy_t bind): gemv(gemv_parameters(simd, ls1, ls2, ng1, ng2, fetch), REDUCE_COLUMNS, bind) {}
reduce_2d_t::reduce_2d_t(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2,
fetching_policy_type fetch, binding_policy_t bind): reduce_2d(reduce_2d_parameters(simd, ls1, ls2, ng1, ng2, fetch), REDUCE_COLUMNS, bind) {}
}
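reduce_2d_n and reduce_2d_t are thin wrappers that select REDUCE_ROWS or REDUCE_COLUMNS over the same reduce_2d machinery. A hypothetical sketch constructing both with the preset parameters used further down (variable names illustrative; assumes the isaac headers):

#include "isaac/kernels/templates/reduce_2d.h"

// Hypothetical sketch: row-wise reduction (the old gemv_n) and column-wise
// reduction (the old gemv_t), built with the preset parameter values.
isaac::templates::reduce_2d_n rows(1, 8, 8, 4, 16,
                                   isaac::templates::FETCH_FROM_GLOBAL_STRIDED);
isaac::templates::reduce_2d_t cols(1, 8, 8, 64, 8,
                                   isaac::templates::FETCH_FROM_GLOBAL_STRIDED);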
View File
@@ -81,12 +81,12 @@ public:
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_column>(&math_expression, root_idx, &mapping_)));
else if(root_node.op.type==OPERATOR_ACCESS_INDEX_TYPE)
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_array_access>(&math_expression, root_idx, &mapping_)));
else if (detail::is_scalar_dot(root_node))
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_scalar_dot>(&math_expression, root_idx, &mapping_)));
else if (detail::is_vector_dot(root_node))
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_gemv>(&math_expression, root_idx, &mapping_)));
else if (detail::is_scalar_reduce_1d(root_node))
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_reduce_1d>(&math_expression, root_idx, &mapping_)));
else if (detail::is_vector_reduce_1d(root_node))
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_reduce_2d>(&math_expression, root_idx, &mapping_)));
else if (root_node.op.type_family == OPERATOR_GEMM_TYPE_FAMILY)
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_gemm>(&math_expression, root_idx, &mapping_)));
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_product>(&math_expression, root_idx, &mapping_)));
else if (root_node.op.type == OPERATOR_REPEAT_TYPE)
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_repeat>(&math_expression, root_idx, &mapping_)));
else if (root_node.op.type == OPERATOR_OUTER_PROD_TYPE)
View File
@@ -12,7 +12,7 @@ namespace isaac
namespace templates
{
inline void compute_dot(kernel_generation_stream & os, std::string acc, std::string cur, op_element const & op)
inline void compute_reduce_1d(kernel_generation_stream & os, std::string acc, std::string cur, op_element const & op)
{
if (detail::is_elementwise_function(op))
os << acc << "=" << evaluate(op.type) << "(" << acc << "," << cur << ");" << std::endl;
@@ -20,7 +20,7 @@ inline void compute_dot(kernel_generation_stream & os, std::string acc, std::str
os << acc << "= (" << acc << ")" << evaluate(op.type) << "(" << cur << ");" << std::endl;
}
inline void compute_index_dot(kernel_generation_stream & os, std::string acc, std::string cur, std::string const & acc_value, std::string const & cur_value, op_element const & op)
inline void compute_index_reduce_1d(kernel_generation_stream & os, std::string acc, std::string cur, std::string const & acc_value, std::string const & cur_value, op_element const & op)
{
// os << acc << " = " << cur_value << ">" << acc_value << "?" << cur << ":" << acc << ";" << std::endl;
os << acc << "= select(" << acc << "," << cur << "," << cur_value << ">" << acc_value << ");" << std::endl;
@@ -51,11 +51,11 @@ inline std::string neutral_element(op_element const & op, driver::backend_type b
case OPERATOR_ELEMENT_MIN_TYPE : return INF;
case OPERATOR_ELEMENT_ARGMIN_TYPE : return INF;
default: throw std::runtime_error("Unsupported dot operator : no neutral element known");
default: throw std::runtime_error("Unsupported reduce_1d operator : no neutral element known");
}
}
inline bool is_dot(math_expression::node const & node)
inline bool is_reduce_1d(math_expression::node const & node)
{
return node.op.type_family==OPERATOR_VECTOR_DOT_TYPE_FAMILY
|| node.op.type_family==OPERATOR_COLUMNS_DOT_TYPE_FAMILY
@@ -63,7 +63,7 @@ inline bool is_dot(math_expression::node const & node)
}
inline bool is_index_dot(op_element const & op)
inline bool is_index_reduction(op_element const & op)
{
return op.type==OPERATOR_ELEMENT_ARGFMAX_TYPE
|| op.type==OPERATOR_ELEMENT_ARGMAX_TYPE
View File
@@ -9,11 +9,11 @@
#include "isaac/driver/program_cache.h"
#include "isaac/profiles/profiles.h"
#include "isaac/kernels/parse.h"
#include "isaac/kernels/templates/axpy.h"
#include "isaac/kernels/templates/dot.h"
#include "isaac/kernels/templates/ger.h"
#include "isaac/kernels/templates/gemv.h"
#include "isaac/kernels/templates/gemm.h"
#include "isaac/kernels/templates/elementwise_1d.h"
#include "isaac/kernels/templates/reduce_1d.h"
#include "isaac/kernels/templates/elementwise_2d.h"
#include "isaac/kernels/templates/reduce_2d.h"
#include "isaac/kernels/templates/matrix_product.h"
#include "isaac/exception/operation_not_supported.h"
@@ -134,24 +134,24 @@ profiles::value_type::templates_container const & profiles::value_type::template
std::shared_ptr<templates::base> profiles::create(std::string const & template_name, std::vector<int> const & x)
{
templates::fetching_policy_type fetch[] = {templates::FETCH_FROM_LOCAL, templates::FETCH_FROM_GLOBAL_STRIDED, templates::FETCH_FROM_GLOBAL_CONTIGUOUS};
if(template_name=="axpy")
return std::shared_ptr<templates::base>(new templates::axpy(x[0], x[1], x[2], fetch[x[3]]));
else if(template_name=="dot")
return std::shared_ptr<templates::base>(new templates::dot(x[0], x[1], x[2], fetch[x[3]]));
else if(template_name=="ger")
return std::shared_ptr<templates::base>(new templates::ger(x[0], x[1], x[2], x[3], x[4], fetch[x[5]]));
else if(template_name.find("gemv_n")!=std::string::npos)
return std::shared_ptr<templates::base>(new templates::gemv_n(x[0], x[1], x[2], x[3], x[4], fetch[x[5]]));
else if(template_name.find("gemv_t")!=std::string::npos)
return std::shared_ptr<templates::base>(new templates::gemv_t(x[0], x[1], x[2], x[3], x[4], fetch[x[5]]));
else if(template_name.find("gemm_nn")!=std::string::npos)
return std::shared_ptr<templates::base>(new templates::gemm_nn(x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], fetch[x[8]], fetch[x[9]], x[10], x[11]));
else if(template_name.find("gemm_tn")!=std::string::npos)
return std::shared_ptr<templates::base>(new templates::gemm_tn(x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], fetch[x[8]], fetch[x[9]], x[10], x[11]));
else if(template_name.find("gemm_nt")!=std::string::npos)
return std::shared_ptr<templates::base>(new templates::gemm_nt(x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], fetch[x[8]], fetch[x[9]], x[10], x[11]));
else if(template_name.find("gemm_tt")!=std::string::npos)
return std::shared_ptr<templates::base>(new templates::gemm_tt(x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], fetch[x[8]], fetch[x[9]], x[10], x[11]));
if(template_name=="elementwise_1d")
return std::shared_ptr<templates::base>(new templates::elementwise_1d(x[0], x[1], x[2], fetch[x[3]]));
else if(template_name=="reduce_1d")
return std::shared_ptr<templates::base>(new templates::reduce_1d(x[0], x[1], x[2], fetch[x[3]]));
else if(template_name=="elementwise_2d")
return std::shared_ptr<templates::base>(new templates::elementwise_2d(x[0], x[1], x[2], x[3], x[4], fetch[x[5]]));
else if(template_name.find("reduce_2d_n")!=std::string::npos)
return std::shared_ptr<templates::base>(new templates::reduce_2d_n(x[0], x[1], x[2], x[3], x[4], fetch[x[5]]));
else if(template_name.find("reduce_2d_t")!=std::string::npos)
return std::shared_ptr<templates::base>(new templates::reduce_2d_t(x[0], x[1], x[2], x[3], x[4], fetch[x[5]]));
else if(template_name.find("matrix_product_nn")!=std::string::npos)
return std::shared_ptr<templates::base>(new templates::matrix_product_nn(x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], fetch[x[8]], fetch[x[9]], x[10], x[11]));
else if(template_name.find("matrix_product_tn")!=std::string::npos)
return std::shared_ptr<templates::base>(new templates::matrix_product_tn(x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], fetch[x[8]], fetch[x[9]], x[10], x[11]));
else if(template_name.find("matrix_product_nt")!=std::string::npos)
return std::shared_ptr<templates::base>(new templates::matrix_product_nt(x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], fetch[x[8]], fetch[x[9]], x[10], x[11]));
else if(template_name.find("matrix_product_tt")!=std::string::npos)
return std::shared_ptr<templates::base>(new templates::matrix_product_tt(x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], fetch[x[8]], fetch[x[9]], x[10], x[11]));
else
throw std::invalid_argument("Invalid expression: " + template_name);
}
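The dispatch above maps each of the new template strings onto its class from a flat parameter vector. A hedged usage sketch (whether create is a static member is not visible in this diff and is an assumption here; the vector layout and values mirror the matrix_product_nn branch above):

#include <memory>
#include <vector>
#include "isaac/profiles/profiles.h"

// Hypothetical sketch: 12-entry vector for matrix_product_nn --
// simd, ls0, KL, ls1, D, ms, ks, ns, fetchA-index, fetchB-index,
// lfetch0, lfetch1; the two indices select entries of the fetch[] array above.
std::vector<int> x = {1, 8, 16, 8, 1, 8, 1, 8, 0, 0, 8, 8};
std::shared_ptr<isaac::templates::base> tpl =
    isaac::profiles::create("matrix_product_nn", x);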
@@ -163,7 +163,7 @@ void profiles::import(std::string const & str, driver::CommandQueue const & queu
rapidjson::Document document;
document.Parse<0>(str.c_str());
//Deserialize
std::vector<std::string> operations = {"axpy", "dot", "ger", "gemv_n", "gemv_t", "gemm_nn", "gemm_tn", "gemm_nt", "gemm_tt"};
std::vector<std::string> operations = {"elementwise_1d", "reduce_1d", "elementwise_2d", "reduce_2d_n", "reduce_2d_t", "matrix_product_nn", "matrix_product_tn", "matrix_product_nt", "matrix_product_tt"};
std::vector<std::string> dtype = {"float32", "float64"};
for(auto & operation : operations)
{
@@ -265,15 +265,15 @@ std::map<std::pair<expression_type, numeric_type>, std::shared_ptr<templates::ba
numeric_type types[] = {CHAR_TYPE, UCHAR_TYPE, SHORT_TYPE, USHORT_TYPE, INT_TYPE, UINT_TYPE, LONG_TYPE, ULONG_TYPE, FLOAT_TYPE, DOUBLE_TYPE};
for(auto DTYPE : types)
{
res[std::make_pair(AXPY_TYPE, DTYPE)] = ptr_t (new templates::axpy(1,64,128,templates::FETCH_FROM_GLOBAL_STRIDED));
res[std::make_pair(DOT_TYPE, DTYPE)] = ptr_t(new templates::dot(1,64,128,templates::FETCH_FROM_GLOBAL_STRIDED));
res[std::make_pair(GER_TYPE, DTYPE)] = ptr_t(new templates::ger(1,128,1,16,32,templates::FETCH_FROM_GLOBAL_STRIDED));
res[std::make_pair(GEMV_N_TYPE, DTYPE)] = ptr_t(new templates::gemv_n(1, 8, 8, 4, 16, templates::FETCH_FROM_GLOBAL_STRIDED));
res[std::make_pair(GEMV_T_TYPE, DTYPE)] = ptr_t(new templates::gemv_t(1, 8, 8, 64, 8, templates::FETCH_FROM_GLOBAL_STRIDED));
res[std::make_pair(GEMM_NN_TYPE, DTYPE)] = ptr_t(new templates::gemm_nn(1, 8, 16, 8, 1, 8, 1, 8, templates::FETCH_FROM_LOCAL, templates::FETCH_FROM_LOCAL, 8, 8, true));
res[std::make_pair(GEMM_TN_TYPE, DTYPE)] = ptr_t(new templates::gemm_tn(1, 8, 16, 8, 1, 8, 1, 8, templates::FETCH_FROM_LOCAL, templates::FETCH_FROM_LOCAL, 8, 8, true));
res[std::make_pair(GEMM_NT_TYPE, DTYPE)] = ptr_t(new templates::gemm_nt(1, 8, 16, 8, 1, 8, 1, 8, templates::FETCH_FROM_LOCAL, templates::FETCH_FROM_LOCAL, 8, 8, true));
res[std::make_pair(GEMM_TT_TYPE, DTYPE)] = ptr_t(new templates::gemm_tt(1, 8, 16, 8, 1, 8, 1, 8, templates::FETCH_FROM_LOCAL, templates::FETCH_FROM_LOCAL, 8, 8, true));
res[std::make_pair(AXPY_TYPE, DTYPE)] = ptr_t (new templates::elementwise_1d(1,64,128,templates::FETCH_FROM_GLOBAL_STRIDED));
res[std::make_pair(DOT_TYPE, DTYPE)] = ptr_t(new templates::reduce_1d(1,64,128,templates::FETCH_FROM_GLOBAL_STRIDED));
res[std::make_pair(GER_TYPE, DTYPE)] = ptr_t(new templates::elementwise_2d(1,128,1,16,32,templates::FETCH_FROM_GLOBAL_STRIDED));
res[std::make_pair(GEMV_N_TYPE, DTYPE)] = ptr_t(new templates::reduce_2d_n(1, 8, 8, 4, 16, templates::FETCH_FROM_GLOBAL_STRIDED));
res[std::make_pair(GEMV_T_TYPE, DTYPE)] = ptr_t(new templates::reduce_2d_t(1, 8, 8, 64, 8, templates::FETCH_FROM_GLOBAL_STRIDED));
res[std::make_pair(GEMM_NN_TYPE, DTYPE)] = ptr_t(new templates::matrix_product_nn(1, 8, 16, 8, 1, 8, 1, 8, templates::FETCH_FROM_LOCAL, templates::FETCH_FROM_LOCAL, 8, 8, true));
res[std::make_pair(GEMM_TN_TYPE, DTYPE)] = ptr_t(new templates::matrix_product_tn(1, 8, 16, 8, 1, 8, 1, 8, templates::FETCH_FROM_LOCAL, templates::FETCH_FROM_LOCAL, 8, 8, true));
res[std::make_pair(GEMM_NT_TYPE, DTYPE)] = ptr_t(new templates::matrix_product_nt(1, 8, 16, 8, 1, 8, 1, 8, templates::FETCH_FROM_LOCAL, templates::FETCH_FROM_LOCAL, 8, 8, true));
res[std::make_pair(GEMM_TT_TYPE, DTYPE)] = ptr_t(new templates::matrix_product_tt(1, 8, 16, 8, 1, 8, 1, 8, templates::FETCH_FROM_LOCAL, templates::FETCH_FROM_LOCAL, 8, 8, true));
}
return res;
}
View File
@@ -161,7 +161,7 @@ namespace isaac
expression_type final_type;
//GEMM
if(symbolic::preset::gemm::args args = symbolic::preset::gemm::check(tree, rootidx)){
if(symbolic::preset::matrix_product::args args = symbolic::preset::matrix_product::check(tree, rootidx)){
final_type = args.type;
}
//Default
View File
@@ -9,7 +9,7 @@ namespace symbolic
namespace preset
{
void gemm::handle_node(math_expression::container_type const & tree, size_t rootidx, args & a)
void matrix_product::handle_node(math_expression::container_type const & tree, size_t rootidx, args & a)
{
//Matrix-Matrix product node
if(tree[rootidx].op.type_family==OPERATOR_GEMM_TYPE_FAMILY)
@@ -46,11 +46,11 @@ void gemm::handle_node(math_expression::container_type const & tree, size_t root
}
}
gemm::args gemm::check(math_expression::container_type const & tree, size_t rootidx)
matrix_product::args matrix_product::check(math_expression::container_type const & tree, size_t rootidx)
{
lhs_rhs_element const * assigned = &tree[rootidx].lhs;
numeric_type dtype = assigned->dtype;
gemm::args result ;
matrix_product::args result ;
if(dtype==INVALID_NUMERIC_TYPE)
return result;
result.alpha = value_scalar(1, dtype);
View File
@@ -73,7 +73,7 @@ def main():
libraries += ['gnustl_shared']
#Source files
src = 'src/lib/symbolic/preset.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/io.cpp src/lib/symbolic/expression.cpp src/lib/array.cpp src/lib/value_scalar.cpp src/lib/driver/backend.cpp src/lib/driver/device.cpp src/lib/driver/kernel.cpp src/lib/driver/buffer.cpp src/lib/driver/platform.cpp src/lib/driver/check.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/dispatch.cpp src/lib/driver/program_cache.cpp src/lib/driver/context.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/handle.cpp src/lib/exception/unknown_datatype.cpp src/lib/exception/operation_not_supported.cpp src/lib/profiles/presets.cpp src/lib/profiles/profiles.cpp src/lib/profiles/predictors/random_forest.cpp src/lib/kernels/templates/gemv.cpp src/lib/kernels/templates/axpy.cpp src/lib/kernels/templates/gemm.cpp src/lib/kernels/templates/ger.cpp src/lib/kernels/templates/dot.cpp src/lib/kernels/templates/base.cpp src/lib/kernels/mapped_object.cpp src/lib/kernels/stream.cpp src/lib/kernels/parse.cpp src/lib/kernels/keywords.cpp src/lib/kernels/binder.cpp src/lib/wrap/clBLAS.cpp src/lib/wrap/cublas.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
src = 'src/lib/symbolic/preset.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/io.cpp src/lib/symbolic/expression.cpp src/lib/array.cpp src/lib/value_scalar.cpp src/lib/driver/backend.cpp src/lib/driver/device.cpp src/lib/driver/kernel.cpp src/lib/driver/buffer.cpp src/lib/driver/platform.cpp src/lib/driver/check.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/dispatch.cpp src/lib/driver/program_cache.cpp src/lib/driver/context.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/handle.cpp src/lib/exception/unknown_datatype.cpp src/lib/exception/operation_not_supported.cpp src/lib/profiles/presets.cpp src/lib/profiles/profiles.cpp src/lib/profiles/predictors/random_forest.cpp src/lib/kernels/templates/reduce_2d.cpp src/lib/kernels/templates/elementwise_2d.cpp src/lib/kernels/templates/elementwise_1d.cpp src/lib/kernels/templates/reduce_1d.cpp src/lib/kernels/templates/matrix_product.cpp src/lib/kernels/templates/base.cpp src/lib/kernels/mapped_object.cpp src/lib/kernels/stream.cpp src/lib/kernels/parse.cpp src/lib/kernels/keywords.cpp src/lib/kernels/binder.cpp src/lib/wrap/clBLAS.cpp src/lib/wrap/cublas.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
boostsrc = 'external/boost/libs/'
for s in ['numpy','python','smart_ptr','system','thread']:
src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]