From 10524ebdeeaacf7d6b35f85a07fea638e38d1eb2 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 21 Aug 2015 13:06:20 -0400 Subject: [PATCH] CUDA: various improvements --- bench/CMakeLists.txt | 8 ++-- bench/blas.cpp | 70 +++++++++++++++++--------------- cmake/python/setup.py | 4 +- include/isaac/defines.h | 8 ++-- include/isaac/driver/common.h | 2 +- include/isaac/driver/device.h | 2 +- include/isaac/driver/event.h | 4 -- include/isaac/driver/handle.h | 11 ++++- include/isaac/kernels/binder.h | 2 +- include/isaac/kernels/keywords.h | 1 + include/isaac/value_scalar.h | 2 +- lib/driver/backend.cpp | 12 +++--- lib/driver/event.cpp | 2 +- lib/driver/handle.cpp | 9 ++-- lib/driver/ndrange.cpp | 3 ++ lib/driver/platform.cpp | 6 ++- lib/kernels/templates/gemm.cpp | 41 ++++++++++--------- lib/profiles/profiles.cpp | 11 ++--- python/setup.py | 4 +- python/src/bind/exceptions.cpp | 41 +++++++++++-------- tests/linalg/axpy.cpp | 6 ++- tests/linalg/gemm.cpp | 2 +- tune/android/tune/model.py | 19 +++++---- tune/android/tune/optimize.py | 6 +-- tune/android/tune/tune.py | 24 +++++++---- 25 files changed, 170 insertions(+), 130 deletions(-) diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt index 330acd3d6..153487968 100644 --- a/bench/CMakeLists.txt +++ b/bench/CMakeLists.txt @@ -39,12 +39,14 @@ string(REPLACE ";" " " BLAS_DEF_STR "${BLAS_DEF}") include_directories(${CMAKE_CURRENT_SOURCE_DIR}) foreach(PROG blas) if(CUDA_FOUND) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} " ${BLAS_DEF_STR} -std=c++11 ${BACKEND_DEFINES}") + set(OLD_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + set(CMAKE_CXX_FLAGS "") set(CUPROG ${CMAKE_CURRENT_BINARY_DIR}/${PROG}.cu) file(COPY ${PROG}.cpp DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${PROG}.cpp ${CUPROG}) - cuda_add_executable(${PROG}-bench ${CUPROG}) - cuda_add_cublas_to_target(${PROG}-bench) + cuda_add_executable(${PROG}-bench ${CUPROG} OPTIONS "-std=c++11 ${BLAS_DEF_STR} ${BACKEND_DEFINES}") + cuda_add_cublas_to_target(${PROG}-bench) + set(CMAKE_CXX_FLAGS "${OLD_CXX_FLAGS}") else() add_executable(${PROG}-bench ${PROG}.cpp) set_target_properties(${PROG}-bench PROPERTIES COMPILE_FLAGS "${BLAS_DEF_STR}") diff --git a/bench/blas.cpp b/bench/blas.cpp index d5e3fe851..9647e4943 100644 --- a/bench/blas.cpp +++ b/bench/blas.cpp @@ -94,7 +94,7 @@ void bench(sc::numeric_type dtype, std::string operation) cudaEventCreate(&stop);\ OP;\ cudaThreadSynchronize();\ - while(total_time*1e-3 < 1e-3){\ + while(total_time*1e-3 < 1e-2){\ flush = sc::zeros(1e6, 1, dtype);\ cudaEventRecord(start,0);\ OP;\ @@ -191,7 +191,6 @@ void bench(sc::numeric_type dtype, std::string operation) #endif #ifdef BENCH_CUBLAS T *cux, *cuy; - T result; cudaMalloc((void**) &cux, N * sizeof(T)); cudaMalloc((void**) &cuy, N * sizeof(T)); BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1), 2*N*dtsize/t) @@ -210,6 +209,7 @@ void bench(sc::numeric_type dtype, std::string operation) //AlexNet MNs.push_back(std::make_tuple('N',1000,256)); MNs.push_back(std::make_tuple('N',4096,256)); + MNs.push_back(std::make_tuple('T',169,256)); MNs.push_back(std::make_tuple('T',169,384)); MNs.push_back(std::make_tuple('T',729,256)); @@ -261,39 +261,43 @@ void bench(sc::numeric_type dtype, std::string operation) if(operation.substr(0,4)=="gemm") { std::vector > MNKs; - MNKs.push_back(std::make_tuple("Square [512]",'N','T',512,512,512)); - MNKs.push_back(std::make_tuple("Square [1536]",'N','T',1536,1536,1536)); - //AlexNet (Forward) - MNKs.push_back(std::make_tuple("F-Conv1",'N','N',3025,96,363)); - 
MNKs.push_back(std::make_tuple("F-Conv2",'N','N',729,128,1200)); - MNKs.push_back(std::make_tuple("F-Conv3",'N','N',169,384,2304)); - MNKs.push_back(std::make_tuple("F-Conv4",'N','N',169,192,1728)); - MNKs.push_back(std::make_tuple("F-Conv5",'N','N',169,128,1728)); - //LeNet (Forward) - MNKs.push_back(std::make_tuple("F-Conv1",'N','N',576,20,25)); - MNKs.push_back(std::make_tuple("F-Conv2",'N','N',64,50,500)); + //Square + MNKs.push_back(std::make_tuple("Square [N=896]",'N','T',896,896,896)); + MNKs.push_back(std::make_tuple("Square [N=2560]",'N','T',2560,2560,2560)); - //AlexNet (Backward) - MNKs.push_back(std::make_tuple("B-Conv5",'T','N',1728,128,169)); - MNKs.push_back(std::make_tuple("B-Conv4",'T','N',1728,192,169)); - MNKs.push_back(std::make_tuple("B-Conv3",'T','N',2304,384,169)); - MNKs.push_back(std::make_tuple("B-Conv2",'T','N',1200,128,729)); - MNKs.push_back(std::make_tuple("B-Conv1",'T','N',363,96,3025)); - //LeNet (Backward) - MNKs.push_back(std::make_tuple("B-Conv2",'T','N',500,50,64)); - MNKs.push_back(std::make_tuple("B-Conv1",'T','N',25,20,576)); + //Convolution + MNKs.push_back(std::make_tuple("Convolution [AlexNet-1]",'N','N',3025,96,363)); + MNKs.push_back(std::make_tuple("Convolution [AlexNet-2]",'N','N',729,128,1200)); + MNKs.push_back(std::make_tuple("Convolution [AlexNet-3]",'N','N',169,384,2304)); + MNKs.push_back(std::make_tuple("Convolution [AlexNet-4]",'N','N',169,192,1728)); + MNKs.push_back(std::make_tuple("Convolution [AlexNet-5]",'N','N',169,128,1728)); +// MNKs.push_back(std::make_tuple("Convolution [LeNet-1],'N','N',576,20,25)); +// MNKs.push_back(std::make_tuple("Convolution [LeNet-2]",'N','N',64,50,500)); + //Convolution Gradient-1 + MNKs.push_back(std::make_tuple("Convolution Gradient-1 [AlexNet-5]",'T','N',1728,128,169)); + MNKs.push_back(std::make_tuple("Convolution Gradient-1 [AlexNet-4]",'T','N',1728,192,169)); + MNKs.push_back(std::make_tuple("Convolution Gradient-1 [AlexNet-3]",'T','N',2304,384,169)); + MNKs.push_back(std::make_tuple("Convolution Gradient-1 [AlexNet-2]",'T','N',1200,128,729)); + MNKs.push_back(std::make_tuple("Convolution Gradient-1 [AlexNet-1]",'T','N',363,96,3025)); +// MNKs.push_back(std::make_tuple("Conv. Gradient-1 [LeNet-2]",'T','N',500,50,64)); +// MNKs.push_back(std::make_tuple("Conv. Gradient-1 [LeNet-1]",'T','N',25,20,576)); - MNKs.push_back(std::make_tuple("B-Conv5 [bottom]",'N','T',169,1728,128)); - MNKs.push_back(std::make_tuple("B-Conv4 [bottom]",'N','T',169,1728,192)); - MNKs.push_back(std::make_tuple("B-Conv3 [bottom]",'N','T',169,2304,384)); - MNKs.push_back(std::make_tuple("B-Conv2 [bottom]",'N','T',729,1200,128)); - //LeNet (Backward) - MNKs.push_back(std::make_tuple("B-Conv2 [bottom]",'N','T',64,500,50)); + //Convolution Gradient-2 + MNKs.push_back(std::make_tuple("Convolution Gradient-2 [AlexNet-5]",'N','T',169,1728,128)); + MNKs.push_back(std::make_tuple("Convolution Gradient-2 [AlexNet-4]",'N','T',169,1728,192)); + MNKs.push_back(std::make_tuple("Convolution Gradient-2 [AlexNet-3]",'N','T',169,2304,384)); + MNKs.push_back(std::make_tuple("Convolution Gradient-2 [AlexNet-2]",'N','T',729,1200,128)); +// MNKs.push_back(std::make_tuple("Conv. 
Gradient-2 [LeNet-2]",'N','T',64,500,50)); - //Covariance (e.g., ICA) - MNKs.push_back(std::make_tuple("ICA [32 channels]",'N','N',32,32,32000)); - MNKs.push_back(std::make_tuple("ICA [256 channels]",'N','N',256,256,32000)); + //Covariance (e.g., ICA, 10minutes/1khz) + MNKs.push_back(std::make_tuple("ICA [32 channels]",'N','T',32,32,600000)); + MNKs.push_back(std::make_tuple("ICA [256 channels]",'N','T',256,256,600000)); + + //Bi-diagonalization + MNKs.push_back(std::make_tuple("Bidiagonalization [Iteration 1]",'N','T',4096,4096,32)); + MNKs.push_back(std::make_tuple("Bidiagonalization [Iteration 10]",'N','T',3456,3456,32)); + MNKs.push_back(std::make_tuple("Bidiagonalization [Iteration 50]",'N','T',896,896,32)); /*---------*/ /*--BLAS3--*/ @@ -317,7 +321,7 @@ void bench(sc::numeric_type dtype, std::string operation) #ifdef HAS_A_BLAS int_t lda = A.ld(), ldb = B.ld(), ldc = C.ld(); #endif - BENCHMARK_ISAAC(C = sc::control(AT?(BT?dot(A.T(),B.T()):dot(A.T(),B)):(BT?dot(A,B.T()):dot(A,B)), sc::execution_options_type(0, &events)), (double)2*M*N*K/t); + BENCHMARK_ISAAC(C = sc::control(AT?(BT?dot(A.T(),B.T()):dot(A.T(),B)):(BT?dot(A,B.T()):dot(A,B)), sc::execution_options_type(0, &events), sc::dispatcher_options_type(false)), (double)2*M*N*K/t); /* clblas */ #ifdef BENCH_CLBLAS BENCHMARK_CLBLAS(clblasSgemm(clblasColumnMajor, AT?clblasTrans:clblasNoTrans, BT?clblasTrans:clblasNoTrans, M, N, K, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(B.data()), 0, ldb, @@ -336,7 +340,7 @@ void bench(sc::numeric_type dtype, std::string operation) cudaMalloc((void**) &cuA, M * K * sizeof(T)); cudaMalloc((void**) &cuB, K * N * sizeof(T)); cudaMalloc((void**) &cuC, M * N * sizeof(T)); - BENCHMARK_CUDA(cublasSgemm('n', 't', M, N, K, 1, cuA, lda, cuB, ldb, 1, cuC, ldc), (double)2*M*N*K/t) + BENCHMARK_CUDA(cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, cuA, lda, cuB, ldb, 1, cuC, ldc), (double)2*M*N*K/t) cudaFree(cuA); cudaFree(cuB); cudaFree(cuC); diff --git a/cmake/python/setup.py b/cmake/python/setup.py index 65a6471ba..75e7b444a 100644 --- a/cmake/python/setup.py +++ b/cmake/python/setup.py @@ -143,7 +143,7 @@ def main(): libraries=libraries)] #External - extensions += [Extension('autotuning.external.sklearn._tree', + extensions += [Extension('external.sklearn._tree', ['external/sklearn/_tree.c'], include_dirs = [numpy_include])] @@ -155,7 +155,7 @@ def main(): author='Philippe Tillet', author_email='ptillet@g.harvard.edu', license='MPL 2.0', - packages=['isaac','isaac.autotuning', 'isaac.autotuning.external', 'isaac.autotuning.external.deap', 'isaac.autotuning.external.deap.tools', 'isaac.autotuning.external.sklearn'], + packages=['isaac','isaac.external', 'isaac.external.sklearn'], ext_package="isaac", ext_modules=extensions, cmdclass={'build_py': build_py, 'build_ext': build_ext_subclass}, diff --git a/include/isaac/defines.h b/include/isaac/defines.h index 3f9d6d478..9dedb9424 100644 --- a/include/isaac/defines.h +++ b/include/isaac/defines.h @@ -18,10 +18,10 @@ #define RESTORE_MSVC_WARNING_C4275 __pragma(warning(disable: 4275)) #else - #define DISABLE_MSVC_WARNING_C4251 - #define RESTORE_MSVC_WARNING_C4251 - #define DISABLE_MSVC_WARNING_C4275 - #define RESTORE_MSVC_WARNING_C4275 + #define DISABLE_MSVC_WARNING_C4251 + #define RESTORE_MSVC_WARNING_C4251 + #define DISABLE_MSVC_WARNING_C4275 + #define RESTORE_MSVC_WARNING_C4275 #endif #endif diff --git a/include/isaac/driver/common.h b/include/isaac/driver/common.h index 86cefb210..e94853de6 100644 --- a/include/isaac/driver/common.h +++ 
b/include/isaac/driver/common.h
@@ -19,7 +19,7 @@ namespace isaac
 namespace driver
 {
-enum ISAACAPI backend_type
+enum backend_type
 {
   OPENCL
 #ifdef ISAAC_WITH_CUDA
diff --git a/include/isaac/driver/device.h b/include/isaac/driver/device.h
index 3dd4faf6f..7aa16d77e 100644
--- a/include/isaac/driver/device.h
+++ b/include/isaac/driver/device.h
@@ -20,7 +20,7 @@ private:
   friend class CommandQueue;
 public:
-  enum ISAACAPI Type
+  enum Type
   {
     GPU = CL_DEVICE_TYPE_GPU,
     CPU = CL_DEVICE_TYPE_CPU,
diff --git a/include/isaac/driver/event.h b/include/isaac/driver/event.h
index 17af0d14e..5b61d7b89 100644
--- a/include/isaac/driver/event.h
+++ b/include/isaac/driver/event.h
@@ -15,10 +15,6 @@ namespace driver
 class ISAACAPI Event
 {
   friend class CommandQueue;
-private:
-#ifdef ISAAC_WITH_CUDA
-  typedef std::pair<CUevent, CUevent> cu_event_t;
-#endif
 public:
   Event(cl_event const & event, bool take_ownership = true);
   Event(backend_type backend);
diff --git a/include/isaac/driver/handle.h b/include/isaac/driver/handle.h
index 932dde63d..831e4e3df 100644
--- a/include/isaac/driver/handle.h
+++ b/include/isaac/driver/handle.h
@@ -12,6 +12,15 @@ namespace isaac
 namespace driver
 {
+#ifdef ISAAC_WITH_CUDA
+  struct cu_event_t{
+    operator bool() const { return first && second; }
+    CUevent first;
+    CUevent second;
+  };
+
+#endif
+
 #ifdef ISAAC_WITH_CUDA
 #define HANDLE_TYPE(CLTYPE, CUTYPE) Handle<CLTYPE, CUTYPE>
 #else
@@ -30,7 +39,7 @@ private:
   static void _delete(CUevent x);
   static void _delete(CUfunction);
   static void _delete(CUmodule x);
-  static void _delete(std::pair<CUevent, CUevent> x);
+  static void _delete(cu_event_t x);
 #endif
   static void release(cl_context x);
diff --git a/include/isaac/kernels/binder.h b/include/isaac/kernels/binder.h
index 9d2b9ab56..a8d3f510e 100644
--- a/include/isaac/kernels/binder.h
+++ b/include/isaac/kernels/binder.h
@@ -21,7 +21,7 @@ public:
   virtual ~symbolic_binder();
   virtual bool bind(driver::Buffer const &) = 0;
   virtual unsigned int get(driver::Buffer const &) = 0;
-  virtual unsigned int get();
+  unsigned int get();
 protected:
   unsigned int current_arg_;
   std::map memory;
diff --git a/include/isaac/kernels/keywords.h b/include/isaac/kernels/keywords.h
index 262153277..9b9aac7c6 100644
--- a/include/isaac/kernels/keywords.h
+++ b/include/isaac/kernels/keywords.h
@@ -70,6 +70,7 @@ struct CastPrefix: public keyword{ CastPrefix(driver::backend_type backend, std:
 struct InitPrefix: public keyword{ InitPrefix(driver::backend_type backend, std::string const & datatype): keyword(backend, "", "make_" + datatype){} };
 struct Infinity: public keyword{ Infinity(driver::backend_type backend, std::string const & datatype): keyword(backend, "INFINITY", "infinity<" + datatype + ">()"){} };
+struct Select: public keyword{ Select(driver::backend_type backend, std::string cond, std::string if_value, std::string else_value): keyword(backend, "select(" + else_value + "," + if_value + "," + cond + ")", "(" + cond + ")?" 
+ if_value + ":" + else_value) {} }; #undef ADD_KEYWORD diff --git a/include/isaac/value_scalar.h b/include/isaac/value_scalar.h index c65909aa3..bd25810f0 100644 --- a/include/isaac/value_scalar.h +++ b/include/isaac/value_scalar.h @@ -1,9 +1,9 @@ #ifndef ISAAC_VALUE_SCALAR_H #define ISAAC_VALUE_SCALAR_H -#include #include "isaac/defines.h" #include "isaac/types.h" +#include namespace isaac { diff --git a/lib/driver/backend.cpp b/lib/driver/backend.cpp index eae47885f..0f9e881f2 100644 --- a/lib/driver/backend.cpp +++ b/lib/driver/backend.cpp @@ -130,12 +130,12 @@ void backend::platforms(std::vector & platforms) #ifdef ISAAC_WITH_CUDA platforms.push_back(Platform(CUDA)); #endif - cl_uint nplatforms; - ocl::check(clGetPlatformIDs(0, NULL, &nplatforms)); - std::vector clplatforms(nplatforms); - ocl::check(clGetPlatformIDs(nplatforms, clplatforms.data(), NULL)); - for(cl_platform_id p: clplatforms) - platforms.push_back(Platform(p)); +// cl_uint nplatforms; +// ocl::check(clGetPlatformIDs(0, NULL, &nplatforms)); +// std::vector clplatforms(nplatforms); +// ocl::check(clGetPlatformIDs(nplatforms, clplatforms.data(), NULL)); +// for(cl_platform_id p: clplatforms) +// platforms.push_back(Platform(p)); } void backend::synchronize(Context const & context) diff --git a/lib/driver/event.cpp b/lib/driver/event.cpp index 26899eb5a..2d4bfa0fb 100644 --- a/lib/driver/event.cpp +++ b/lib/driver/event.cpp @@ -44,7 +44,7 @@ long Event::elapsed_time() const } } -HANDLE_TYPE(cl_event, Event::cu_event_t) & Event::handle() +HANDLE_TYPE(cl_event, cu_event_t) & Event::handle() { return h_; } } diff --git a/lib/driver/handle.cpp b/lib/driver/handle.cpp index 283c191f7..f551959fe 100644 --- a/lib/driver/handle.cpp +++ b/lib/driver/handle.cpp @@ -19,7 +19,7 @@ template void Handle::_delete(CUstream x) { cuStreamDestroy(x); } template -void Handle::_delete(CUdevice) { } +void Handle::_delete(CUdevice) { std::cout << "CUdevice" << std::endl;} template void Handle::_delete(CUevent x) { cuEventDestroy(x); } @@ -31,7 +31,7 @@ template void Handle::_delete(CUmodule x) { cuModuleUnload(x); } template -void Handle::_delete(std::pair x) { _delete(x.first); _delete(x.second); } +void Handle::_delete(cu_event_t x) { _delete(x.first); _delete(x.second); } #endif @@ -100,8 +100,9 @@ template Handle::~Handle() { #ifdef ISAAC_WITH_CUDA - if(has_ownership_ && cu_.unique()) + if(has_ownership_ && cu_ && cu_.unique() && *cu_){ _delete(*cu_); + } #endif if(has_ownership_ && cl_ && cl_.unique() && *cl_) release(*cl_); @@ -132,7 +133,7 @@ template class Handle; template class Handle; template class Handle; template class Handle; -template class Handle >; +template class Handle; template class Handle; template class Handle; #else diff --git a/lib/driver/ndrange.cpp b/lib/driver/ndrange.cpp index a87b93c2c..2d1678294 100644 --- a/lib/driver/ndrange.cpp +++ b/lib/driver/ndrange.cpp @@ -10,6 +10,8 @@ namespace driver NDRange::NDRange(size_t size0) { sizes_[0] = size0; + sizes_[1] = 1; + sizes_[2] = 1; dimension_ = 1; } @@ -17,6 +19,7 @@ NDRange::NDRange(size_t size0, size_t size1) { sizes_[0] = size0; sizes_[1] = size1; + sizes_[2] = 1; dimension_ = 2; } diff --git a/lib/driver/platform.cpp b/lib/driver/platform.cpp index 8b8d074e7..b80c4b4bc 100644 --- a/lib/driver/platform.cpp +++ b/lib/driver/platform.cpp @@ -12,7 +12,11 @@ namespace driver { #ifdef ISAAC_WITH_CUDA -Platform::Platform(backend_type backend): backend_(backend){} +Platform::Platform(backend_type backend): backend_(backend) +{ + if(backend==CUDA) + cuInit(0); +} #endif 
Platform::Platform(cl_platform_id const & platform) : backend_(OPENCL) diff --git a/lib/kernels/templates/gemm.cpp b/lib/kernels/templates/gemm.cpp index 71fe523bd..353a58718 100644 --- a/lib/kernels/templates/gemm.cpp +++ b/lib/kernels/templates/gemm.cpp @@ -56,8 +56,11 @@ gemm_parameters::gemm_parameters(unsigned int simd_width return 0; } - int gemm::is_invalid_impl(driver::Device const &, expressions_tuple const &) const + int gemm::is_invalid_impl(driver::Device const & device, expressions_tuple const &) const { + if(device.vendor()==driver::Device::Vendor::NVIDIA && p_.simd_width > 1) + return TEMPLATE_INVALID_SIMD_WIDTH; + if(p_.A_fetching_policy!=FETCH_FROM_LOCAL || p_.B_fetching_policy!=FETCH_FROM_LOCAL) return TEMPLATE_INVALID_FETCHING_POLICY_TYPE; @@ -170,8 +173,8 @@ gemm_parameters::gemm_parameters(unsigned int simd_width stream << Local(backend) << " " << sdtype << " lB[" << p_.kL*p_.nL << "];" << std::endl; unsigned int npA = p_.mL/(A_trans_=='N'?p_.local_fetch_0*p_.simd_width:p_.local_fetch_1); unsigned int npB = p_.nL/(B_trans_=='T'?p_.local_fetch_0*p_.simd_width:p_.local_fetch_1); - stream << "__global " << sdtype << "* Ai[" << npA << "];" << std::endl; - stream << "__global " << sdtype << "* Bi[" << npB << "];" << std::endl; + stream << Global(backend) << " " << sdtype << "* Ai[" << npA << "];" << std::endl; + stream << Global(backend) << " " << sdtype << "* Bi[" << npB << "];" << std::endl; stream << std::endl; stream << "//identifiers" << std::endl; @@ -179,7 +182,11 @@ gemm_parameters::gemm_parameters(unsigned int simd_width stream << "int idt;" << std::endl; if(has_depth) stream << "int gidz, div, offz;" << std::endl; - stream << "int4 ids = (int4)(" << GroupIdx0(backend) << "," << GroupIdx1(backend) << "," << LocalIdx0(backend) << "," << LocalIdx1(backend) << ");" << std::endl; + stream << "uint4 ids;" << std::endl; + stream << "ids.x = " << GroupIdx0(backend) << ";" << std::endl; + stream << "ids.y = " << GroupIdx1(backend) << ";" << std::endl; + stream << "ids.z = " << LocalIdx0(backend) << ";" << std::endl; + stream << "ids.w = " << LocalIdx1(backend) << ";" << std::endl; stream << std::endl; stream << "//offsets" << std::endl; @@ -266,15 +273,15 @@ gemm_parameters::gemm_parameters(unsigned int simd_width for(unsigned int i = 0 ; i < npA ; i++ ) if (A_trans_=='N') - stream << "Ai[" << i << "] += select(0, (int)((idT.x + " << i*p_.local_fetch_0*p_.simd_width << ")" << ASTRIDE1 << "), " << i*p_.local_fetch_0*p_.simd_width << " < M);" << std::endl; + stream << "Ai[" << i << "] += " << Select(backend, to_string(i*p_.local_fetch_0*p_.simd_width) + " < M", "(int)((idT.x + " + to_string(i*p_.local_fetch_0*p_.simd_width) + ")" + ASTRIDE1 + ")", "0") << ";" << std::endl; else - stream << "Ai[" << i << "] += select(0, (int)((idT.y + " << i*p_.local_fetch_1 << ")*lda), " << i*p_.local_fetch_1 << " < M);" << std::endl; + stream << "Ai[" << i << "] += " << Select(backend, to_string(i*p_.local_fetch_1) + " < M", "(int)((idT.y + " + to_string(i*p_.local_fetch_1) + ")*lda)", "0") << ";" << std::endl; for(unsigned int i = 0 ; i < npB ; i++ ) if (B_trans_=='T') - stream << "Bi[" << i << "] += select(0, (int)((idT.x + " << i*p_.local_fetch_0*p_.simd_width << ")" << BSTRIDE1 << "), " << i*p_.local_fetch_0*p_.simd_width << " < N);" << std::endl; + stream << "Bi[" << i << "] += " << Select(backend, to_string(i*p_.local_fetch_0*p_.simd_width) + " < N", "(int)((idT.x + " + to_string(i*p_.local_fetch_0*p_.simd_width) + ")" + BSTRIDE1 + ")", "0") << ";" << std::endl; else - stream << "Bi[" 
<< i << "] += select(0, (int)((idT.y + " << i*p_.local_fetch_1 << ")*ldb), " << i*p_.local_fetch_1 << " < N);" << std::endl; + stream << "Bi[" << i << "] += " << Select(backend, to_string(i*p_.local_fetch_1) + " < N", "(int)((idT.y + " + to_string(i*p_.local_fetch_1) + ")*ldb)", "0") << ";" << std::endl; stream << std::endl; stream << "//Outer loop" << std::endl; @@ -504,14 +511,6 @@ gemm_parameters::gemm_parameters(unsigned int simd_width stream << "N -= ids.y;" << std::endl; stream << "N -= ids.w*" << p_.simd_width << ";" << std::endl; - stream << "int ibm[" << p_.mS << "];" << std::endl; - for(unsigned int m=0; m < p_.mS; ++m) - { - string Ci = to_string((m/p_.simd_width)*(p_.local_size_0*p_.simd_width) + m%p_.simd_width); - stream << "ibm[" << m << "] = " << Ci << " < M;" << std::endl; - } - - for(unsigned int n=0; n < p_.nS; ++n) { string Cj = to_string((n/p_.simd_width)*(p_.local_size_1*p_.simd_width) + n%p_.simd_width); @@ -521,13 +520,15 @@ gemm_parameters::gemm_parameters(unsigned int simd_width for(unsigned int m=0; m < p_.mS; ++m) { string Ci = to_string((m/p_.simd_width)*(p_.local_size_0*p_.simd_width) + m%p_.simd_width); - stream << "if(ibm[" << m << "]) "; - stream << "C[" << Ci << CSTRIDE1 << "] = rC[" << m << "][" << n << "] + select((" << sdtype << ")0, C[" << Ci << CSTRIDE1 << "], beta>0);" << std::endl; + stream << "if(" << Ci << "< M) "; + stream << "C[" << Ci << CSTRIDE1 << "] = rC[" << m << "][" << n << "];" << std::endl; } - if((n+1)%p_.simd_width==0) + if((n+1)%p_.simd_width==0){ stream << "C += ldc*" << p_.local_size_1*p_.simd_width - p_.simd_width + 1 << ";" << std::endl; - else + } + else{ stream << "C += ldc;" << std::endl; + } } diff --git a/lib/profiles/profiles.cpp b/lib/profiles/profiles.cpp index 3008ed500..b23d26c46 100644 --- a/lib/profiles/profiles.cpp +++ b/lib/profiles/profiles.cpp @@ -119,15 +119,16 @@ void profiles::value_type::execute(controller const & expr) else if(predictor_.get()) { std::vector predictions = predictor_->predict(x); - do{ +// do{ label = std::distance(predictions.begin(),std::max_element(predictions.begin(), predictions.end())); - predictions[label] = 0; - }while(templates_[label]->temporary_workspace(expr.x()) > MAX_TEMPORARY_WORKSPACE); +// predictions[label] = 0; +// }while(templates_[label]->temporary_workspace(expr.x()) > MAX_TEMPORARY_WORKSPACE); } //Execution - if(templates_[label]->temporary_workspace(expr.x()) > MAX_TEMPORARY_WORKSPACE) - throw operation_not_supported_exception("Running this operation would require an overly large temporary."); +// std::cout << label << std::endl; +// if(templates_[label]->temporary_workspace(expr.x()) > MAX_TEMPORARY_WORKSPACE) +// throw operation_not_supported_exception("Running this operation would require an overly large temporary."); return templates_[label]->enqueue(queue_, program, tools::to_string(label), *fallback_, expr); } diff --git a/python/setup.py b/python/setup.py index 7e5beaf30..28fb8015e 100644 --- a/python/setup.py +++ b/python/setup.py @@ -143,7 +143,7 @@ def main(): libraries=libraries)] #External - extensions += [Extension('autotuning.external.sklearn._tree', + extensions += [Extension('external.sklearn._tree', ['external/sklearn/_tree.c'], include_dirs = [numpy_include])] @@ -155,7 +155,7 @@ def main(): author='Philippe Tillet', author_email='ptillet@g.harvard.edu', license='MPL 2.0', - packages=['isaac','isaac.autotuning', 'isaac.autotuning.external', 'isaac.autotuning.external.deap', 'isaac.autotuning.external.deap.tools', 'isaac.autotuning.external.sklearn'], + 
packages=['isaac','isaac.external', 'isaac.external.sklearn'], ext_package="isaac", ext_modules=extensions, cmdclass={'build_py': build_py, 'build_ext': build_ext_subclass}, diff --git a/python/src/bind/exceptions.cpp b/python/src/bind/exceptions.cpp index fcb4e6287..03a2205e2 100644 --- a/python/src/bind/exceptions.cpp +++ b/python/src/bind/exceptions.cpp @@ -81,27 +81,32 @@ private: void export_exceptions() { - wrap::exception("OperationNotSupported", bp::init()) - .def("__str__", &isaac::operation_not_supported_exception::what) - ; - wrap::exception("LaunchOutOfResources") - .def("__str__", &isaac::driver::ocl::exception::out_of_resources::what) - ; +#define BIND_EXCEPTION(CPPNAME, PYTHONNAME) \ + wrap::exception(PYTHONNAME, bp::init())\ + .def("__str__", &isaac::CPPNAME::what) - wrap::exception("MemObjectAllocationFailure") - .def("__str__", &isaac::driver::ocl::exception::mem_object_allocation_failure::what) - ; + BIND_EXCEPTION(operation_not_supported_exception, "OperationNotSupported"); - wrap::exception("OutOfHostMemory") - .def("__str__", &isaac::driver::ocl::exception::out_of_host_memory::what) - ; + //OCL + wrap::exception("OclException", bp::no_init); +#define BIND_OCL_EXCEPTION(CPPNAME, PYTHONNAME) \ + wrap::exception >(PYTHONNAME)\ + .def("__str__", &isaac::driver::ocl::exception::CPPNAME::what) - wrap::exception("InvalidWorkGroupSize") - .def("__str__", &isaac::driver::ocl::exception::invalid_work_group_size::what) - ; - wrap::exception("InvalidValue") - .def("__str__", &isaac::driver::ocl::exception::invalid_value::what) - ; + BIND_OCL_EXCEPTION(out_of_resources, "OclLaunchOutOfResources"); + BIND_OCL_EXCEPTION(mem_object_allocation_failure, "MemObjectAllocationFailure"); + BIND_OCL_EXCEPTION(out_of_host_memory, "OutOfHostMemory"); + BIND_OCL_EXCEPTION(invalid_work_group_size, "InvalidWorkGroupSize"); + BIND_OCL_EXCEPTION(invalid_value, "InvalidValue"); + + //CUDA + wrap::exception("CudaException", bp::no_init); +#define BIND_CUDA_EXCEPTION(CPPNAME, PYTHONNAME) \ + wrap::exception >(PYTHONNAME)\ + .def("__str__", &isaac::driver::cuda::exception::CPPNAME::what) + + + BIND_CUDA_EXCEPTION(launch_out_of_resources, "CudaLaunchOutOfResources"); } diff --git a/tests/linalg/axpy.cpp b/tests/linalg/axpy.cpp index b2e55b1ce..da49551b1 100644 --- a/tests/linalg/axpy.cpp +++ b/tests/linalg/axpy.cpp @@ -3,6 +3,7 @@ #include "common.hpp" #include "isaac/array.h" #include "isaac/wrap/clBLAS.h" +#include "isaac/driver/common.h" namespace sc = isaac; typedef isaac::int_t int_t; @@ -45,6 +46,7 @@ void test_element_wise_vector(T epsilon, simple_vector_base & cx, simple_vect std::cout << std::endl;\ } + if(queue.device().backend()==sc::driver::OPENCL){ #define PREFIX "[C]" RUN_TEST_VECTOR_AXPY("AXPY", cz[i] = a*cx[i] + cz[i], BLAS::F(clblasSaxpy, clblasDaxpy)(N, a, CHANDLE(x), x.start()[0], x.stride()[0], CHANDLE(z), z.start()[0], z.stride()[0], @@ -56,9 +58,9 @@ void test_element_wise_vector(T epsilon, simple_vector_base & cx, simple_vect RUN_TEST_VECTOR_AXPY("SCAL", cz[i] = a*cz[i], BLAS::F(clblasSscal, clblasDscal)(N, a, CHANDLE(z), z.start()[0], z.stride()[0], 1, &clqueue, 0, NULL, NULL)); - - #undef PREFIX + } + #define PREFIX "[C++]" RUN_TEST_VECTOR_AXPY("z = 0", cz[i] = 0, z = zeros(N, 1, dtype, context)) RUN_TEST_VECTOR_AXPY("z = x", cz[i] = cx[i], z = x) diff --git a/tests/linalg/gemm.cpp b/tests/linalg/gemm.cpp index adbb7d10e..46a55f305 100644 --- a/tests/linalg/gemm.cpp +++ b/tests/linalg/gemm.cpp @@ -52,7 +52,7 @@ void test_impl(T epsilon, simple_matrix_base & cC, 
simple_matrix_base cons
   else\
     std::cout << std::endl;
-  if(interf==clBLAS)
+  if(C.context().backend()==sc::driver::OPENCL && interf==clBLAS)
   {
     cl_command_queue clqueue = queue.handle().cl();
diff --git a/tune/android/tune/model.py b/tune/android/tune/model.py
index 5d5c94f4b..d5fb4abda 100644
--- a/tune/android/tune/model.py
+++ b/tune/android/tune/model.py
@@ -24,26 +24,29 @@ def nrmse(y_ground, y):
 def train(X, Y, profiles):
     X = np.array(X)
     Y = np.array(Y)
-    M = X.shape[0]
-
+    #Remove unused profiles
+    unused = np.where(np.bincount(np.argmax(Y, 1))==0)[0]
+    profiles = [x for ix,x in enumerate(profiles) if ix not in unused]
+    Y = np.delete(Y, unused, axis=1)
+    #Shuffle
     p = np.random.permutation(X.shape[0])
+    M = X.shape[0]
     X = X[p,:]
     Y = Y[p,:]
     #Train the.profile
-    cut = int(1.00*M)
-    CV = .1
-    XTr, YTr = X[:,:], Y[:,:]
-    XCv, YCv = X[:max(1,CV*M),:], Y[:max(1,CV*M),:]
+    cut = int(.7*M)
+    XTr, YTr = X[:cut,:], Y[:cut,:]
+    XCv, YCv = X[cut:,:], Y[cut:,:]
     nrmses = {}
-    for N in range(1,min(M+1,20)):
+    for N in range(1,min(M+1,10)):
         for depth in range(1,min(M+1,20)):
             clf = RandomForestRegressor(N, max_depth=depth).fit(XTr, YTr)
             t = np.argmax(clf.predict(XCv), axis = 1)
             y = np.array([YCv[i,t[i]] for i in range(t.size)])
             ground = np.max(YCv[:,:], axis=1)
             nrmses[clf] = nrmse(ground, y)
-    
+
     clf = min(nrmses, key=nrmses.get)
     return clf, nrmses[clf]
diff --git a/tune/android/tune/optimize.py b/tune/android/tune/optimize.py
index b0cfd99a4..187de5410 100644
--- a/tune/android/tune/optimize.py
+++ b/tune/android/tune/optimize.py
@@ -13,12 +13,12 @@ from numpy import cumsum
 import tools
-fetch_types = [sc.templates.fetching_policy_type.FETCH_FROM_GLOBAL_CONTIGUOUS,
-               sc.templates.fetching_policy_type.FETCH_FROM_GLOBAL_STRIDED,
+fetch_types = [sc.templates.fetching_policy_type.FETCH_FROM_LOCAL,
+               sc.templates.fetching_policy_type.FETCH_FROM_LOCAL,
                sc.templates.fetching_policy_type.FETCH_FROM_LOCAL,
                sc.templates.fetching_policy_type.FETCH_FROM_LOCAL]
-to_catch = (sc.OperationNotSupported, sc.LaunchOutOfResources, sc.MemObjectAllocationFailure, sc.InvalidWorkGroupSize, sc.OutOfHostMemory, sc.InvalidValue)
+to_catch = (sc.OperationNotSupported, sc.OclLaunchOutOfResources, sc.CudaLaunchOutOfResources, sc.MemObjectAllocationFailure, sc.InvalidWorkGroupSize, sc.OutOfHostMemory, sc.InvalidValue)
 def exhaustive(template, sizes, context):
     tree, _ = tools.tree_of(template, sizes, context)
diff --git a/tune/android/tune/tune.py b/tune/android/tune/tune.py
index bdb0d7d6c..f8fcabb63 100644
--- a/tune/android/tune/tune.py
+++ b/tune/android/tune/tune.py
@@ -1,7 +1,7 @@
 import random, argparse, json, os
 from math import log, isinf
 from itertools import chain, product
-from numpy import argsort, argmax
+from numpy import argsort, argmax, where, delete, bincount
 from operator import mul
 import isaac as sc
 from isaac.external.sklearn.forest import RandomForestRegressor
@@ -9,6 +9,8 @@ import optimize, tools, model
 from json import encoder
 import json
+to_catch = (sc.OperationNotSupported, sc.OclLaunchOutOfResources, sc.CudaLaunchOutOfResources, sc.MemObjectAllocationFailure, sc.InvalidWorkGroupSize, sc.OutOfHostMemory, sc.InvalidValue)
+
 encoder.FLOAT_REPR = lambda o: format(o, '.2f')
 encoder.separators = (',',':')
@@ -30,14 +32,14 @@ def do_tuning(device, operation, json_path):
         sizes[sc.templates.axpy] = [(x,) for x in tools.expspace(1e3, 1e8, 4)]
         sizes[sc.templates.gemv_n] = product(pow2range(4,17), pow2range(4,17))
         sizes[sc.templates.gemv_t] = 
sizes[sc.templates.gemv_n] - sizes[sc.templates.gemm_nn] = product(pow2range(5, 12), pow2range(5, 12), pow2range(5, 15)) + sizes[sc.templates.gemm_nn] = product(pow2range(5, 12), pow2range(5, 12), pow2range(5, 17)) sizes[sc.templates.gemm_tn] = sizes[sc.templates.gemm_nn] sizes[sc.templates.gemm_nt] = sizes[sc.templates.gemm_nn] sizes[sc.templates.gemm_tt] = sizes[sc.templates.gemm_nn] #Quick tuning - AlexNet sizes + Intuition - quick_tuning = True + quick_tuning = False if quick_tuning: sizes[sc.templates.ger] = [(1536,1536)] @@ -48,12 +50,14 @@ def do_tuning(device, operation, json_path): (729,256), (3025,96)] - sizes[sc.templates.gemm_nn] = [(576,20,25),(64,50,500),(3025,96,363), + sizes[sc.templates.gemm_nn] = [(3025,96,363), (729,128,1200), (169,384,2304), (169,192,1728), (169,128,1728)] - sizes[sc.templates.gemm_nt] = [(169,1728,128), + + sizes[sc.templates.gemm_nt] = [(1536,1536,1536), + (169,1728,128), (169,1728,192), (169,2304,384), (729,1200,128)] @@ -121,7 +125,7 @@ def do_tuning(device, operation, json_path): for b in best: try: perf += [performance(x, tools.benchmark(operation, profiles[b], tree))] - except (sc.OperationNotSupported, sc.LaunchOutOfResources, sc.MemObjectAllocationFailure): + except to_catch: pass predicted = profiles[best[argmax(perf)]] tune = not optimize.is_local_optimum(predicted, operation, x, context) @@ -138,7 +142,7 @@ def do_tuning(device, operation, json_path): try: time = tools.benchmark(operation, new, _tree) perf = performance(xx, time) - except (sc.OperationNotSupported, sc.LaunchOutOfResources, sc.MemObjectAllocationFailure): + except to_catch: perf = 0 yy.append(0 if isinf(perf) else perf) #Update dataset @@ -147,7 +151,7 @@ def do_tuning(device, operation, json_path): for ip, p in enumerate(profiles): try: perf = 0 if fastest and ip < nparams and predperf[ip]/fastest < .1 else performance(x,tools.benchmark(operation, p, tree)) - except (sc.OperationNotSupported, sc.LaunchOutOfResources, sc.MemObjectAllocationFailure): + except to_catch: perf = 0 y.append(0 if isinf(perf) else perf) X.append(x) @@ -158,6 +162,10 @@ def do_tuning(device, operation, json_path): csv.writer(f).writerows(data) + unused = where(bincount(argmax(Y, 1))==0)[0] + profiles = [x for ix,x in enumerate(profiles) if ix not in unused] + Y = delete(Y, unused, axis=1) + #Export to JSON json_path = tools.sanitize(device.name) + '.json' if not json_path else json_path if os.path.isfile(json_path):