From 95f2564c1a8636a40f9234d67093dcf79bbf23f6 Mon Sep 17 00:00:00 2001
From: Philippe Tillet <ptillet@g.harvard.edu>
Date: Mon, 24 Aug 2015 19:24:33 -0400
Subject: [PATCH] Tuning: Android UI improvement

---
 cmake/python/setup.py          |   2 +-
 include/isaac/driver/common.h  |  17 +-
 include/nvrtc.h                | 392 ---------------------------------
 lib/driver/backend.cpp         |  12 +-
 lib/kernels/templates/gemm.cpp |  28 +--
 lib/profiles/profiles.cpp      |  11 +-
 python/setup.py                |   6 +-
 tune/android/isaac.kv          |  26 ++-
 tune/android/main.py           |  47 +++-
 tune/android/screens/tune.kv   |  14 +-
 tune/android/tune/optimize.py  | 229 ++++++++++---------
 tune/android/tune/tools.py     |   9 +-
 tune/android/tune/tune.py      | 355 +++++++++++++++--------------
 13 files changed, 422 insertions(+), 726 deletions(-)
 delete mode 100644 include/nvrtc.h
diff --git a/cmake/python/setup.py b/cmake/python/setup.py
index 75e7b444a..bcd2cf00a 100644
--- a/cmake/python/setup.py
+++ b/cmake/python/setup.py
@@ -155,7 +155,7 @@ def main():
                 author='Philippe Tillet',
                 author_email='ptillet@g.harvard.edu',
                 license='MPL 2.0',
-                packages=['isaac','isaac.external', 'isaac.external.sklearn'],
+                packages=['isaac', 'isaac.external', 'isaac.external.sklearn'],
                 ext_package="isaac",
                 ext_modules=extensions,
                 cmdclass={'build_py': build_py, 'build_ext': build_ext_subclass},
diff --git a/include/isaac/driver/common.h b/include/isaac/driver/common.h
index e94853de6..3c8dde3b2 100644
--- a/include/isaac/driver/common.h
+++ b/include/isaac/driver/common.h
@@ -1,15 +1,14 @@
 #ifndef ISAAC_DRIVER_COMMON_H
 #define ISAAC_DRIVER_COMMON_H
-
-
-#include <CL/cl.h>
-#include <CL/cl_ext.h>
 #include <exception>
 
-#ifdef ISAAC_WITH_CUDA
-#include <cuda.h>
-#include <nvrtc.h>
-#endif
+//OpenCL Backend
+#include "isaac/driver/external/CL/cl.h"
+#include "isaac/driver/external/CL/cl_ext.h"
+//CUDA Backend
+#include "isaac/driver/external/CUDA/cuda.h"
+#include "isaac/driver/external/CUDA/nvrtc.h"
+
 #include "isaac/defines.h"
 
 DISABLE_MSVC_WARNING_C4275
@@ -30,7 +29,6 @@ enum backend_type
 
 
 
-#ifdef ISAAC_WITH_CUDA
 
 namespace nvrtc
 {
@@ -130,7 +128,6 @@ namespace cuda
 void check(CUresult);
 }
 
-#endif
 
 namespace ocl
 {
diff --git a/include/nvrtc.h b/include/nvrtc.h
deleted file mode 100644
index 390de0cc9..000000000
--- a/include/nvrtc.h
+++ /dev/null
@@ -1,392 +0,0 @@
-/*
- * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
- *
- * NOTICE TO LICENSEE:
- *
- * This source code and/or documentation ("Licensed Deliverables") are
- * subject to NVIDIA intellectual property rights under U.S. and
- * international Copyright laws.
- *
- * These Licensed Deliverables contained herein is PROPRIETARY and
- * CONFIDENTIAL to NVIDIA and is being provided under the terms and
- * conditions of a form of NVIDIA software license agreement by and
- * between NVIDIA and Licensee ("License Agreement") or electronically
- * accepted by Licensee.  Notwithstanding any terms or conditions to
- * the contrary in the License Agreement, reproduction or disclosure
- * of the Licensed Deliverables to any third party without the express
- * written consent of NVIDIA is prohibited.
- *
- * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
- * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
- * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
- * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
- * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
- * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
- * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
- * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
- * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
- * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
- * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
- * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
- * OF THESE LICENSED DELIVERABLES.
- *
- * U.S. Government End Users.  These Licensed Deliverables are a
- * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
- * 1995), consisting of "commercial computer software" and "commercial
- * computer software documentation" as such terms are used in 48
- * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
- * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
- * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
- * U.S. Government End Users acquire the Licensed Deliverables with
- * only those rights set forth herein.
- *
- * Any use of the Licensed Deliverables in individual and commercial
- * software must include, in the user documentation and internal
- * comments to the code, the above Disclaimer and U.S. Government End
- * Users Notice.
- */
-
-#ifndef __NVRTC_H__
-#define __NVRTC_H__
-
-#ifdef __cplusplus
-extern "C" {
-#endif /* __cplusplus */
-
-#include <stdlib.h>
-
-
-/*****************************//**
- *
- * \defgroup error Error Handling
- *
- ********************************/
-
-
-/**
- * \ingroup error
- * \brief   CUDA Online Compiler API call result code.
- */
-typedef enum {
-  NVRTC_SUCCESS = 0,
-  NVRTC_ERROR_OUT_OF_MEMORY = 1,
-  NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
-  NVRTC_ERROR_INVALID_INPUT = 3,
-  NVRTC_ERROR_INVALID_PROGRAM = 4,
-  NVRTC_ERROR_INVALID_OPTION = 5,
-  NVRTC_ERROR_COMPILATION = 6,
-  NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7
-} nvrtcResult;
-
-
-/**
- * \ingroup error
- * \brief   ::nvrtcGetErrorString is a helper function that stringifies the
- *          given #nvrtcResult code, e.g., \link #nvrtcResult NVRTC_SUCCESS
- *          \endlink to \c "NVRTC_SUCCESS".  For unrecognized enumeration
- *          values, it returns \c "NVRTC_ERROR unknown".
- *
- * \param   [in] result CUDA Online Compiler API result code.
- * \return  Message string for the given #nvrtcResult code.
- */
-const char *nvrtcGetErrorString(nvrtcResult result);
-
-
-/****************************************//**
- *
- * \defgroup query General Information Query
- *
- *******************************************/
-
-
-/**
- * \ingroup query
- * \brief   ::nvrtcVersion sets the output parameters \p major and \p minor
- *          with the CUDA Online Compiler version number.
- *
- * \param   [out] major CUDA Online Compiler major version number.
- * \param   [out] minor CUDA Online Compiler minor version number.
- * \return
- *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
- *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
- *
- */
-nvrtcResult nvrtcVersion(int *major, int *minor);
-
-
-/********************************//**
- *
- * \defgroup compilation Compilation
- *
- ***********************************/
-
-
-/**
- * \ingroup compilation
- * \brief   ::nvrtcProgram is the unit of compilation, and an opaque handle for
- *          a program.
- *
- * To compile a CUDA program string, an instance of nvrtcProgram must be
- * created first with ::nvrtcCreateProgram, then compiled with
- * ::nvrtcCompileProgram.
- */
-typedef struct _nvrtcProgram *nvrtcProgram;
-
-
-/**
- * \ingroup compilation
- * \brief   ::nvrtcCreateProgram creates an instance of ::nvrtcProgram with the
- *          given input parameters, and sets the output parameter \p prog with
- *          it.
- *
- * \param   [out] prog         CUDA Online Compiler program.
- * \param   [in]  src          CUDA program source.
- * \param   [in]  name         CUDA program name.\n
- *                             \p name can be \c NULL; \c "default_program" is
- *                             used when \p name is \c NULL.
- * \param   [in]  numHeaders   Number of headers used.\n
- *                             \p numHeaders must be greater than or equal to 0.
- * \param   [in]  headers      Sources of the headers.\n
- *                             \p headers can be \c NULL when \p numHeaders is
- *                             0.
- * \param   [in]  includeNames Name of each header by which they can be
- *                             included in the CUDA program source.\n
- *                             \p includeNames can be \c NULL when \p numHeaders
- *                             is 0.
- * \return
- *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
- *   - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
- *   - \link #nvrtcResult NVRTC_ERROR_PROGRAM_CREATION_FAILURE \endlink
- *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
- *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
- *
- * \see     ::nvrtcDestroyProgram
- */
-nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
-                               const char *src,
-                               const char *name,
-                               int numHeaders,
-                               const char **headers,
-                               const char **includeNames);
-
-
-/**
- * \ingroup compilation
- * \brief   ::nvrtcDestroyProgram destroys the given program.
- *
- * \param    [in] prog CUDA Online Compiler program.
- * \return
- *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
- *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
- *
- * \see     ::nvrtcCreateProgram
- */
-nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog);
-
-
-/**
- * \ingroup compilation
- * \brief   ::nvrtcCompileProgram compiles the given program.
- *
- * The valid compiler options are:
- *
- *   - Compilation targets
- *     - --gpu-architecture=<em>\<GPU architecture name\></em> (-arch)\n
- *       Specify the name of the class of GPU architectures for which the
- *       input must be compiled.\n
- *       - Valid <em>GPU architecture name</em>s:
- *         - compute_20
- *         - compute_30
- *         - compute_35
- *         - compute_50
- *       - Default: compute_20
- *   - Separate compilation / whole-program compilation
- *     - --device-c (-dc)\n
- *       Generate relocatable code that can be linked with other relocatable
- *       device code.  It is equivalent to --relocatable-device-code=true.
- *     - --device-w (-dw)\n
- *       Generate non-relocatable code.  It is equivalent to
- *       --relocatable-device-code=false.
- *     - --relocatable-device-code=<em>[true, false]</em> (-rdc)\n
- *       Enable (disable) the generation of relocatable device code.
- *       - Default: false
- *   - Debugging support
- *     - --device-debug (-G)\n
- *       Generate debug information.
- *     - --generate-line-info (-lineinfo)\n
- *       Generate line-number information.
- *   - Code generation
- *     - --maxrregcount=<em>\<N\></em> (-maxrregcount)\n
- *       Specify the maximum amount of registers that GPU functions can use.
- *       Until a function-specific limit, a higher value will generally
- *       increase the performance of individual GPU threads that execute this
- *       function.  However, because thread registers are allocated from a
- *       global register pool on each GPU, a higher value of this option will
- *       also reduce the maximum thread block size, thereby reducing the amount
- *       of thread parallelism.  Hence, a good maxrregcount value is the result
- *       of a trade-off.  If this option is not specified, then no maximum is
- *       assumed.  Value less than the minimum registers required by ABI will
- *       be bumped up by the compiler to ABI minimum limit.
- *     - --ftz=<em>[true, false]</em> (-ftz)\n
- *       When performing single-precision floating-point operations, flush
- *       denormal values to zero or preserve denormal values.  --use_fast_math
- *       implies --ftz=true.
- *       - Default: false
- *     - --prec-sqrt=<em>[true, false]</em> (-prec-sqrt)\n
- *       For single-precision floating-point square root, use IEEE
- *       round-to-nearest mode or use a faster approximation.  --use_fast_math
- *       implies --prec-sqrt=false.
- *       - Default: true
- *     - --prec-div=<em>[true, false]</em> (-prec-div)\n
- *       For single-precision floating-point division and reciprocals, use IEEE
- *       round-to-nearest mode or use a faster approximation.  --use_fast_math
- *       implies --prec-div=false.
- *       - Default: true
- *     - --fmad=<em>[true, false]</em> (-fmad)\n
- *       Enables (disables) the contraction of floating-point multiplies and
- *       adds/subtracts into floating-point multiply-add operations (FMAD,
- *       FFMA, or DFMA).  --use_fast_math implies --fmad=true.
- *       - Default: true
- *     - --use_fast_math (-use_fast_math)\n
- *       Make use of fast math operations. --use_fast_math implies --ftz=true
- *       --prec-div=false --prec-sqrt=false --fmad=true.
- *   - Preprocessing
- *     - --define-macro=<em>\<macrodef\></em> (-D)\n
- *       <em>macrodef</em> can be either <em>name</em> or
- *       <em>name=definitions</em>.
- *       - <em>name</em>\n
- *         Predefine <em>name</em> as a macro with definition 1.
- *       - <em>name=definition</em>\n
- *         The contents of <em>definition</em> are tokenized and preprocessed
- *         as if they appeared during translation phase three in a \c \#define
- *         directive.  In particular, the definition will be truncated by
- *         embedded new line characters.
- *     - --undefine-macro=<em>\<name\></em> (-U)\n
- *       Cancel any previous definition of \em name.
- *     - --include-path=<em>\<dir\></em> (-I)\n
- *       Add the directory <em>dir</em> to the list of directories to be
- *       searched for headers.  These paths are searched after the list of
- *       headers given to ::nvrtcCreateProgram.
- *     - --pre-include=<em>\<header\></em> (-include)\n
- *       Preinclude <em>header</em> during preprocessing.
- *   - Language Dialect
- *     - --std=c++11 (-std=c++11)\n
- *       Set language dialect to C++11.
- *     - --builtin-move-forward=<em>[true, false]</em> (-builtin-move-forward)\n
- *       Provide builtin definitions of std::move and std::forward, when C++11
- *       language dialect is selected.
- *      - Default : true
- *     - --builtin-initializer-list=<em>[true, false]</em> (-builtin-initializer-list)\n
- *       Provide builtin definitions of std::initializer_list class and member
- *       functions when C++11 language dialect is selected.
- *       - Default : true
- *   - Misc
- *     - --disable-warnings (-w)\n
- *       Inhibit all warning messages.
- *     - --restrict (-restrict)\n
- *       Programmer assertion that all kernel pointer parameters are restrict
- *       pointers.
- *     - --device-as-default-execution-space
-*        (-default-device)\n
- *       Treat entities with no execution space annotation as \c __device__
- *       entities.
- *
- * \param   [in] prog       CUDA Online Compiler program.
- * \param   [in] numOptions Number of compiler options passed.
- * \param   [in] options    Compiler options in the form of C string array.\n
- *                          \p options can be \c NULL when \p numOptions is 0.
- *
- * \return
- *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
- *   - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
- *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
- *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
- *   - \link #nvrtcResult NVRTC_ERROR_INVALID_OPTION \endlink
- *   - \link #nvrtcResult NVRTC_ERROR_COMPILATION \endlink
- *   - \link #nvrtcResult NVRTC_ERROR_BUILTIN_OPERATION_FAILURE \endlink
- */
-nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
-                                int numOptions, const char **options);
-
-
-/**
- * \ingroup compilation
- * \brief   ::nvrtcGetPTXSize sets \p ptxSizeRet with the size of the PTX
- *          generated by the previous compilation of \p prog (including the
- *          trailing \c NULL).
- *
- * \param   [in]  prog       CUDA Online Compiler program.
- * \param   [out] ptxSizeRet Size of the generated PTX (including the trailing
- *                           \c NULL).
- * \return
- *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
- *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
- *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
- *
- * \see     ::nvrtcGetPTX
- */
-nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
-
-
-/**
- * \ingroup compilation
- * \brief   ::nvrtcGetPTX stores the PTX generated by the previous compilation
- *          of \p prog in the memory pointed by \p ptx.
- *
- * \param   [in]  prog CUDA Online Compiler program.
- * \param   [out] ptx  Compiled result.
- * \return
- *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
- *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
- *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
- *
- * \see     ::nvrtcGetPTXSize
- */
-nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
-
-
-/**
- * \ingroup compilation
- * \brief   ::nvrtcGetProgramLogSize sets \p logSizeRet with the size of the
- *          log generated by the previous compilation of \p prog (including the
- *          trailing \c NULL).
- *
- * Note that compilation log may be generated with warnings and informative
- * messages, even when the compilation of \p prog succeeds.
- *
- * \param   [in]  prog       CUDA Online Compiler program.
- * \param   [out] logSizeRet Size of the compilation log
- *                           (including the trailing \c NULL).
- * \return
- *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
- *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
- *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
- *
- * \see     ::nvrtcGetProgramLog
- */
-nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet);
-
-
-/**
- * \ingroup compilation
- * \brief   ::nvrtcGetProgramLog stores the log generated by the previous
- *          compilation of \p prog in the memory pointed by \p log.
- *
- * \param   [in]  prog CUDA Online Compiler program.
- * \param   [out] log  Compilation log.
- * \return
- *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
- *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
- *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
- *
- * \see     ::nvrtcGetProgramLogSize
- */
-nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
-
-
-#ifdef __cplusplus
-}
-#endif /* __cplusplus */
-
-#endif /* __NVRTC_H__ */
diff --git a/lib/driver/backend.cpp b/lib/driver/backend.cpp
index 0f9e881f2..eae47885f 100644
--- a/lib/driver/backend.cpp
+++ b/lib/driver/backend.cpp
@@ -130,12 +130,12 @@ void backend::platforms(std::vector<Platform> & platforms)
   #ifdef ISAAC_WITH_CUDA
     platforms.push_back(Platform(CUDA));
   #endif
-//    cl_uint nplatforms;
-//    ocl::check(clGetPlatformIDs(0, NULL, &nplatforms));
-//    std::vector<cl_platform_id> clplatforms(nplatforms);
-//    ocl::check(clGetPlatformIDs(nplatforms, clplatforms.data(), NULL));
-//    for(cl_platform_id p: clplatforms)
-//        platforms.push_back(Platform(p));
+    cl_uint nplatforms;
+    ocl::check(clGetPlatformIDs(0, NULL, &nplatforms));
+    std::vector<cl_platform_id> clplatforms(nplatforms);
+    ocl::check(clGetPlatformIDs(nplatforms, clplatforms.data(), NULL));
+    for(cl_platform_id p: clplatforms)
+        platforms.push_back(Platform(p));
 }
 
 void backend::synchronize(Context const & context)
diff --git a/lib/kernels/templates/gemm.cpp b/lib/kernels/templates/gemm.cpp
index 353a58718..80eb7aff7 100644
--- a/lib/kernels/templates/gemm.cpp
+++ b/lib/kernels/templates/gemm.cpp
@@ -467,23 +467,23 @@ gemm_parameters::gemm_parameters(unsigned int simd_width
     stream << "}" << std::endl;
 
 
-    if(A_trans_=='N' || B_trans_=='T')
-        stream << "int Ky = K - idT.y;" << std::endl;
-    if(A_trans_=='T' || B_trans_=='N')
-        stream << "int Kx = K - idT.x;" << std::endl;
+//    if(A_trans_=='N' || B_trans_=='T')
+//        stream << "int Ky = K - idT.y;" << std::endl;
+//    if(A_trans_=='T' || B_trans_=='N')
+//        stream << "int Kx = K - idT.x;" << std::endl;
 
-    if(A_trans_=='N' || B_trans_=='T')
-        for(unsigned int k = 0; k < p_.kL; k += p_.local_fetch_1)
-            stream << "int condy" << k << " = " << k << " < Ky;" << std::endl;
+//    if(A_trans_=='N' || B_trans_=='T')
+//        for(unsigned int k = 0; k < p_.kL; k += p_.local_fetch_1)
+//            stream << "int condy" << k << " = " << k << " < Ky;" << std::endl;
 
-    if(A_trans_=='T' || B_trans_=='N')
-    {
-        for(unsigned int k = 0 ; k < p_.kL ; k += p_.local_fetch_0*p_.simd_width)
-            for(unsigned int s = 0 ; s < p_.simd_width ; ++s)
-            stream << "int condx" << k + s << " = " << k + s << " < Kx;" << std::endl;
-    }
+//    if(A_trans_=='T' || B_trans_=='N')
+//    {
+//        for(unsigned int k = 0 ; k < p_.kL ; k += p_.local_fetch_0*p_.simd_width)
+//            for(unsigned int s = 0 ; s < p_.simd_width ; ++s)
+//            stream << "int condx" << k + s << " = " << k + s << " < Kx;" << std::endl;
+//    }
 
-    fetch_to_lds(true);
+//    fetch_to_lds(true);
 
     stream << "//Write back C" << std::endl;
     stream << "M += ids.x;" << std::endl;
diff --git a/lib/profiles/profiles.cpp b/lib/profiles/profiles.cpp
index b23d26c46..3008ed500 100644
--- a/lib/profiles/profiles.cpp
+++ b/lib/profiles/profiles.cpp
@@ -119,16 +119,15 @@ void profiles::value_type::execute(controller<expressions_tuple> const & expr)
   else if(predictor_.get())
   {
     std::vector<float> predictions = predictor_->predict(x);
-//    do{
+    do{
         label = std::distance(predictions.begin(),std::max_element(predictions.begin(), predictions.end()));
-//        predictions[label] = 0;
-//    }while(templates_[label]->temporary_workspace(expr.x()) > MAX_TEMPORARY_WORKSPACE);
+        predictions[label] = 0;
+    }while(templates_[label]->temporary_workspace(expr.x()) > MAX_TEMPORARY_WORKSPACE);
   }
 
   //Execution
-//  std::cout << label << std::endl;
-//  if(templates_[label]->temporary_workspace(expr.x()) > MAX_TEMPORARY_WORKSPACE)
-//    throw operation_not_supported_exception("Running this operation would require an overly large temporary.");
+  if(templates_[label]->temporary_workspace(expr.x()) > MAX_TEMPORARY_WORKSPACE)
+    throw operation_not_supported_exception("Running this operation would require an overly large temporary.");
 
   return templates_[label]->enqueue(queue_, program, tools::to_string(label), *fallback_, expr);
 }
diff --git a/python/setup.py b/python/setup.py
index 28fb8015e..0dc3fa2fb 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -114,7 +114,7 @@ def main():
 
     #Include directories
     numpy_include = os.path.join(find_module("numpy")[1], "core", "include")
-    include =' src/include src/lib/external /usr/local/cuda/include'.split() + ['external/boost/', 'external/boost/boost/', numpy_include]
+    include =' src/include src/lib/external'.split() + ['external/boost/', 'external/boost/boost/', numpy_include]
 
     #Android
     if for_android:
@@ -124,7 +124,7 @@ def main():
       libraries += ['gnustl_shared']
 
     #Source files
-    src =  'src/lib/exception/operation_not_supported.cpp src/lib/exception/unknown_datatype.cpp src/lib/value_scalar.cpp src/lib/driver/check.cpp src/lib/driver/ndrange.cpp src/lib/driver/platform.cpp src/lib/driver/backend.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/event.cpp src/lib/driver/kernel.cpp src/lib/driver/handle.cpp src/lib/driver/device.cpp src/lib/driver/program_cache.cpp src/lib/driver/buffer.cpp src/lib/driver/context.cpp src/lib/kernels/templates/axpy.cpp src/lib/kernels/templates/gemv.cpp src/lib/kernels/templates/dot.cpp src/lib/kernels/templates/base.cpp src/lib/kernels/templates/ger.cpp src/lib/kernels/templates/gemm.cpp src/lib/kernels/stream.cpp src/lib/kernels/keywords.cpp src/lib/kernels/mapped_object.cpp src/lib/kernels/binder.cpp src/lib/kernels/parse.cpp src/lib/wrap/clBLAS.cpp src/lib/profiles/predictors/random_forest.cpp src/lib/profiles/presets.cpp src/lib/profiles/profiles.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/expression.cpp src/lib/symbolic/io.cpp src/lib/symbolic/preset.cpp src/lib/array.cpp '.split() + [os.path.join('src', 'bind', sf)  for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
+    src =  'src/lib/symbolic/preset.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/io.cpp src/lib/symbolic/expression.cpp src/lib/array.cpp src/lib/value_scalar.cpp src/lib/driver/backend.cpp src/lib/driver/device.cpp src/lib/driver/kernel.cpp src/lib/driver/buffer.cpp src/lib/driver/platform.cpp src/lib/driver/check.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/program_cache.cpp src/lib/driver/context.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/handle.cpp src/lib/exception/unknown_datatype.cpp src/lib/exception/operation_not_supported.cpp src/lib/profiles/presets.cpp src/lib/profiles/profiles.cpp src/lib/profiles/predictors/random_forest.cpp src/lib/kernels/templates/gemv.cpp src/lib/kernels/templates/axpy.cpp src/lib/kernels/templates/gemm.cpp src/lib/kernels/templates/ger.cpp src/lib/kernels/templates/dot.cpp src/lib/kernels/templates/base.cpp src/lib/kernels/mapped_object.cpp src/lib/kernels/stream.cpp src/lib/kernels/parse.cpp src/lib/kernels/keywords.cpp src/lib/kernels/binder.cpp src/lib/wrap/clBLAS.cpp '.split() + [os.path.join('src', 'bind', sf)  for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
     boostsrc = 'external/boost/libs/'
     for s in ['numpy','python','smart_ptr','system','thread']:
         src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]
@@ -155,7 +155,7 @@ def main():
                 author='Philippe Tillet',
                 author_email='ptillet@g.harvard.edu',
                 license='MPL 2.0',
-                packages=['isaac','isaac.external', 'isaac.external.sklearn'],
+                packages=['isaac', 'isaac.external', 'isaac.external.sklearn'],
                 ext_package="isaac",
                 ext_modules=extensions,
                 cmdclass={'build_py': build_py, 'build_ext': build_ext_subclass},
diff --git a/tune/android/isaac.kv b/tune/android/isaac.kv
index 689424204..a60bf74b9 100644
--- a/tune/android/isaac.kv
+++ b/tune/android/isaac.kv
@@ -27,16 +27,26 @@ BoxLayout:
             ActionPrevious:
                 with_previous: False
 
-            ActionToggleButton:
-                id: Tune
-                text: 'Tune'
-                on_release: app.show_tune()
-                group: 'menu'
-    
-            ActionToggleButton:
+            ActionButton:
                 text: 'Settings'
                 on_release: app.open_settings()
                 group: 'menu'
-                
+    
+    BoxLayout:
+        orientation: 'horizontal'
+        size_hint: 1, .1
+
+        Button:
+            id: Tune
+            text: 'Tune'
+            on_release: app.show_tune()
+            group: 'menu'
+            
+        Button:
+            text: 'Benchmark'
+            group: 'menu'
+            
+            
+            
     ScreenManager:
         id: sm
diff --git a/tune/android/main.py b/tune/android/main.py
index ef2fd7845..65f2b6822 100644
--- a/tune/android/main.py
+++ b/tune/android/main.py
@@ -1,5 +1,7 @@
 from os.path import dirname, realpath, join
 
+from kivy.logger import Logger
+from kivy.uix.scrollview import ScrollView
 from kivy.uix.boxlayout import BoxLayout
 from kivy.uix.label import Label
 from kivy.uix.checkbox import CheckBox
@@ -12,11 +14,23 @@ from kivy.uix.settings import SettingsWithNoMenu
 
 import isaac as sc
 import json
+import thread
 
-from tune.tune import do_tuning
+from tune.tune import Tuner
 
 __version__ = '1.0'
 
+class ScrollableLabel(ScrollView):
+    text = StringProperty('')
+
+class LabelLogger:
+    def __init__(self, label):
+        self.label = label;
+        
+    def info(self, msg):
+        self.label.text += msg + '\n'
+        
+    
 class IsaacScreen(Screen):
     fullscreen = BooleanProperty(False)
 
@@ -54,16 +68,24 @@ class IsaacApp(App):
             
         #Default view
         self.show_tune()
+        
+        #Logger
+        self.logger = LabelLogger(self.screens['Tune'].ids.out)
     
     def start_tuning(self):
-        #FIXME: will be buggy if two devices from two different platforms have the same name
-        device = next(x for x in self.isaac_handler.devices if x.name==self.config.get('hardware', 'device'))
-        operation = sc.templates.axpy
-        json_path = ''
-        #FIXME: Move profiling logics into tuning
-        sc.driver.default.queue_properties = sc.driver.PROFILING_ENABLE
-        print device.infos
-        do_tuning(device, operation, json_path)
+        button = self.screens['Tune'].ids.action_button
+        if button.text == 'Run':
+            #FIXME: will be buggy if two devices from two different platforms have the same name
+            device = next(x for x in self.isaac_handler.devices if x.name==self.config.get('hardware', 'device')) 
+            #FIXME: Move profiling logics into tuning
+            sc.driver.default.queue_properties = sc.driver.PROFILING_ENABLE 
+            self.logger.info('Using ' + device.name)
+            self.logger.info('')
+            tuner = Tuner(self.logger, device, sc.templates.axpy, '')
+            tid = thread.start_new_thread(Tuner.run, (tuner,))
+        else:
+            pass
+        button.text = 'Running...' if button.text == 'Run' else button.text
         
     def show_benchmark(self):
         pass
@@ -77,7 +99,8 @@ class IsaacApp(App):
         if 'Settings' not in self.screens:
             self.screens['Settings'] = Screen(name='Settings')
             self.screens['Settings'].add_widget(settings)
-        self.root.ids.sm.switch_to(self.screens['Settings'], direction='left')
+        if self.root.ids.sm.current != 'Settings':
+            self.root.ids.sm.switch_to(self.screens['Settings'], direction='left')
 
     def build_config(self, config):
         self.isaac_handler = IsaacHandler()
@@ -104,10 +127,10 @@ class IsaacApp(App):
                         'key': operation.lower(),
                         'options': ['Simple', 'Intermediate', 'Full']}]
         
-        settings.add_json_panel('Settings',
+        settings.add_json_panel('ISAAC',
                                 self.config,
                                 data=json.dumps(layout))
-                                
+                                                        
     def close_settings(self, *args):
         pass
 
diff --git a/tune/android/screens/tune.kv b/tune/android/screens/tune.kv
index 53483caff..f6e1ebdcf 100644
--- a/tune/android/screens/tune.kv
+++ b/tune/android/screens/tune.kv
@@ -1,3 +1,10 @@
+<ScrollableLabel>:
+    Label:
+        size_hint_y: None
+        height: self.texture_size[1]
+        text_size: self.width, None
+        text: root.text
+        
 IsaacScreen:
 	name: 'Tune'
 	fullscreen: True
@@ -6,11 +13,14 @@ IsaacScreen:
         orientation: 'vertical'
         
         Button:
-            text: 'Start tuning'
+            id: action_button
+            background_color: .3,.3,.3,.3
+            text: 'Run'
             pos: 0, 0
             size_hint: 1, .1
             on_release: app.start_tuning()
         
-        Label:
+        ScrollableLabel:
+            id: out
             text: ''
             
diff --git a/tune/android/tune/optimize.py b/tune/android/tune/optimize.py
index 187de5410..19d79b70d 100644
--- a/tune/android/tune/optimize.py
+++ b/tune/android/tune/optimize.py
@@ -12,14 +12,13 @@ from external.deap import tools as deap_tools
 from numpy import cumsum
 
 import tools
+from tools import profile_execution_failure
 
 fetch_types = [sc.templates.fetching_policy_type.FETCH_FROM_LOCAL,
                sc.templates.fetching_policy_type.FETCH_FROM_LOCAL,
                sc.templates.fetching_policy_type.FETCH_FROM_LOCAL,
                sc.templates.fetching_policy_type.FETCH_FROM_LOCAL]
 
-to_catch = (sc.OperationNotSupported, sc.OclLaunchOutOfResources, sc.CudaLaunchOutOfResources, sc.MemObjectAllocationFailure, sc.InvalidWorkGroupSize, sc.OutOfHostMemory, sc.InvalidValue)
-
 def exhaustive(template, sizes, context):
     tree, _ = tools.tree_of(template, sizes, context)
     metric = tools.metric_of(template)
@@ -35,127 +34,137 @@ def exhaustive(template, sizes, context):
             time = tools.benchmark(template, parameters, tree)
             if not best or time < best[1]:
                 best = parameters, time
-        except to_catch:
+        except profile_execution_failure:
             pass
         if best:
             stdout.write('%.2f %% | Best %.2f [ for %s ]\r'%(float(idx*100)/len(ranges),metric(sizes, best[1]), best[0]))
     return best[0]
         
-       
-def genetic(template, sizes, context, naccept=200, niter = 1000, cxpb=0.4, mutpb=0.4, popsize = 10, initializer = None, prior = None):
-    tree, _ = tools.tree_of(template, sizes, context)
-    metric = tools.metric_of(template)
-    genetic_infos = tools.genetic_infos_of(template)
-    nbits = genetic_infos['nbits']
-    offsets = cumsum([0] + nbits)
 
-    def bin2gray(A):
-        g = [int(A[0])]
-        for i in range(1, len(A)): 
-            g += [int(A[i-1] != A[i])]
-        return g
+class GeneticOptimizer:
     
-    def gray2int(A):
-        b = [A[0]]
-        for i in range(1, len(A)):
-            b += [int(b[i-1] != A[i])]
-        return int(''.join(map(str,b)), 2)
-    
-    def encode(genome):
-        encoded = [bin2gray(bin(x)[2:].zfill(nb)) for x, nb in zip(genome, nbits)]
-        return sum(encoded, [])
+    def __init__(self, logger, naccept=500, niter=1000, cxpb=.4, mutpb=.4, popsize=10):
+        self.logger = logger
+        self.naccept = naccept
+        self.niter = niter
+        self.cxpb = cxpb
+        self.mutpb = mutpb
+        self.popsize = popsize
         
-    def decode(genome):
-        result = []
-        for off1,off2 in zip(offsets[:-1],offsets[1:]):
-            result += [gray2int(genome[off1:off2])]
-        result = [fetch_types[x] if i in genetic_infos['categorical'] else 2**x for i,x in enumerate(result)]
-        return result
+    def run(self, template, sizes, context, initializer = None, prior = None):
+        tree, _ = tools.tree_of(template, sizes, context)
+        metric = tools.metric_of(template)
+        genetic_infos = tools.genetic_infos_of(template)
+        nbits = genetic_infos['nbits']
+        offsets = cumsum([0] + nbits)
 
-    def evaluate(genome):
-        idx = tuple(genome)
-        if idx not in cache:
-            cache[idx] = tools.benchmark(template, decode(genome), tree)
-        return cache[idx],
+        def bin2gray(A):
+            g = [int(A[0])]
+            for i in range(1, len(A)): 
+                g += [int(A[i-1] != A[i])]
+            return g
         
-    cache = {}
-    hof = deap_tools.HallOfFame(1)
-
-    creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
-    creator.create("Individual", list, fitness=creator.FitnessMin)
-
-    toolbox = base.Toolbox()
-    toolbox.register("evaluate", evaluate)
-    toolbox.register("mate", deap_tools.cxTwoPoint)
-    toolbox.register("mutate", deap_tools.mutFlipBit)
-    toolbox.register("select", deap_tools.selNSGA2)
-
-    #Initialization
-    if initializer is None:
-        initializer = ([random.randint(0, 2**x) for x in nbits] for i in iter(int,1))
-    population = [] 
-
-    genome = encode(prior if prior else list(initializer.next()))
-    while len(population) < popsize:
-        individual = creator.Individual(genome)
-        try:
-            individual.fitness.values = toolbox.evaluate(genome)
-            population += [individual]
-        except to_catch:
-            pass
-        genome = encode(list(initializer.next()))
-    hof.update(population)
-    
-    x = []
-    y = []
-    it = 0
-    
-    while len(cache) < naccept and it<niter:
-        pad = len(cache) - len(x)
-        x += [len(cache)]*pad
-        y += [metric(sizes, hof[0].fitness.values[0])]*pad
+        def gray2int(A):
+            b = [A[0]]
+            for i in range(1, len(A)):
+                b += [int(b[i-1] != A[i])]
+            return int(''.join(map(str,b)), 2)
         
-        offspring = []
-        while len(offspring) < popsize:
-            try:
-                op_choice = random.random()
-                #Cross-over
-                if op_choice < cxpb: 
-                    ind1, ind2 = map(toolbox.clone, random.sample(population, 2))
-                    ind1, ind2 = toolbox.mate(ind1, ind2)
-                    ind = ind1
-                    toolbox.evaluate(ind)
-                    offspring += [ind]
-                #Mutation
-                elif op_choice < cxpb + mutpb: 
-                    ind = toolbox.clone(random.choice(population))
-                    ind, = toolbox.mutate(ind, 1.0/offsets[-1])
-                    toolbox.evaluate(ind)
-                    offspring += [ind]
-                #Reproduction
-                else: 
-                    offspring += [random.choice(population)]
-            except to_catch:
-                pass
-
-
-        #Update fitnesses
-        fitnesses = toolbox.map(toolbox.evaluate, offspring)
-        for ind, fit in zip(offspring, fitnesses):
-            ind.fitness.values = fit
+        def encode(genome):
+            encoded = [bin2gray(bin(x)[2:].zfill(nb)) for x, nb in zip(genome, nbits)]
+            return sum(encoded, [])
             
-        #Update population
-        population[:] = toolbox.select(population + offspring, popsize)
+        def decode(genome):
+            result = []
+            for off1,off2 in zip(offsets[:-1],offsets[1:]):
+                result += [gray2int(genome[off1:off2])]
+            result = [fetch_types[x] if i in genetic_infos['categorical'] else 2**x for i,x in enumerate(result)]
+            return result
+
+        def evaluate(genome):
+            idx = tuple(genome)
+            if idx not in cache:
+                cache[idx] = tools.benchmark(template, decode(genome), tree)
+            return cache[idx],
+            
+        cache = {}
+        hof = deap_tools.HallOfFame(1)
+
+        creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
+        creator.create("Individual", list, fitness=creator.FitnessMin)
+
+        toolbox = base.Toolbox()
+        toolbox.register("evaluate", evaluate)
+        toolbox.register("mate", deap_tools.cxTwoPoint)
+        toolbox.register("mutate", deap_tools.mutFlipBit)
+        toolbox.register("select", deap_tools.selNSGA2)
+
+        #Initialization
+        if initializer is None:
+            initializer = ([random.randint(0, 2**x) for x in nbits] for i in iter(int,1))
+        population = [] 
+
+        genome = encode(prior if prior else list(initializer.next()))
+        while len(population) < self.popsize:
+            individual = creator.Individual(genome)
+            try:
+                individual.fitness.values = toolbox.evaluate(genome)
+                population += [individual]
+            except profile_execution_failure:
+                pass
+            genome = encode(list(initializer.next()))
         hof.update(population)
         
-        optimal = '(%s)'%','.join(map(str,decode(hof[0])))
-        stdout.write('Iter %d | %d evaluated | Best %.2f [ for %s ]\r'%(it, x[-1], y[-1], optimal))
-        stdout.flush()
-        it += 1
-    stdout.write('\n')
+        x = []
+        y = []
+        it = 0
+        
+        while len(cache) < self.naccept and it<self.niter:
+            pad = len(cache) - len(x)
+            x += [len(cache)]*pad
+            y += [metric(sizes, hof[0].fitness.values[0])]*pad
+            
+            offspring = []
+            while len(offspring) < self.popsize:
+                try:
+                    op_choice = random.random()
+                    #Cross-over
+                    if op_choice < self.cxpb: 
+                        ind1, ind2 = map(toolbox.clone, random.sample(population, 2))
+                        ind1, ind2 = toolbox.mate(ind1, ind2)
+                        ind = ind1
+                        toolbox.evaluate(ind)
+                        offspring += [ind]
+                    #Mutation
+                    elif op_choice < self.cxpb + self.mutpb: 
+                        ind = toolbox.clone(random.choice(population))
+                        ind, = toolbox.mutate(ind, 1.0/offsets[-1])
+                        toolbox.evaluate(ind)
+                        offspring += [ind]
+                    #Reproduction
+                    else: 
+                        offspring += [random.choice(population)]
+                except profile_execution_failure:
+                    pass
 
-    return tuple(decode(hof[0])), x, y
-    
+
+            #Update fitnesses
+            fitnesses = toolbox.map(toolbox.evaluate, offspring)
+            for ind, fit in zip(offspring, fitnesses):
+                ind.fitness.values = fit
+                
+            #Update population
+            population[:] = toolbox.select(population + offspring, self.popsize)
+            hof.update(population)
+            
+            optimal = '(%s)'%','.join(map(str,decode(hof[0])))
+            stdout.write('Iter %d | %d evaluated | Best %.2f [ for %s ]\r'%(it, x[-1], y[-1], optimal))
+            stdout.flush()
+            it += 1
+        stdout.write('\n')
+
+        return tuple(decode(hof[0])), x, y
+        
 def is_local_optimum(parameters, template, sizes, context):
     tree, _ = tools.tree_of(template, sizes, context)
     genetic_infos = tools.genetic_infos_of(template)
@@ -174,7 +183,7 @@ def is_local_optimum(parameters, template, sizes, context):
     #Evaluate the provided parameters guess
     try:
         reference = tools.benchmark(template, parameters, tree)
-    except to_catch:
+    except profile_execution_failure:
         return False
         
     #Latency bound -- ignore
@@ -191,7 +200,7 @@ def is_local_optimum(parameters, template, sizes, context):
             time = tools.benchmark(template, x, tree)
             if time/reference < .97:
                 return False
-        except to_catch:
+        except profile_execution_failure:
             pass
     return True
     
diff --git a/tune/android/tune/tools.py b/tune/android/tune/tools.py
index d096b95f6..b49ed10c1 100644
--- a/tune/android/tune/tools.py
+++ b/tune/android/tune/tools.py
@@ -2,6 +2,8 @@ import isaac as sc
 from numpy import mean, median
 from math import ceil, exp, log, sqrt
 
+profile_execution_failure = (sc.OperationNotSupported,  sc.OclLaunchOutOfResources, sc.CudaLaunchOutOfResources, sc.MemObjectAllocationFailure, sc.InvalidWorkGroupSize, sc.OutOfHostMemory, sc.InvalidValue)
+
 def sanitize(string, keep_chars = ['_']):
     string = string.replace(' ', '_').replace('-', '_').lower()
     string = "".join(c for c in string if c.isalnum() or c in keep_chars).rstrip()
@@ -86,7 +88,12 @@ def metric_of(template):
         return lambda sizes, t: memory_footprint(template, sizes)/t
     elif any([issubclass(template, x) for x in compute_bound]):
         return lambda sizes, t: 2*sizes[0]*sizes[1]*sizes[2]*1e-9/t
-                
+           
+def metric_name_of(template):
+    if issubclass(template, sc.templates.gemm):
+        return 'GFLOPS'
+    return 'GB/S'
+
 def genetic_infos_of(template):
     if issubclass(template, sc.templates.axpy):
         return {'categorical': [3], 'nbits': [3,4,4,2] }
diff --git a/tune/android/tune/tune.py b/tune/android/tune/tune.py
index f8fcabb63..fac99a11c 100644
--- a/tune/android/tune/tune.py
+++ b/tune/android/tune/tune.py
@@ -4,10 +4,11 @@ from itertools import chain, product
 from numpy import argsort, argmax, where, delete, bincount
 from operator import mul
 import isaac as sc
+from tools import profile_execution_failure
 from isaac.external.sklearn.forest import RandomForestRegressor
 import optimize, tools, model
 from json import encoder
-import json
+import json, csv
 
 to_catch = (sc.OperationNotSupported, sc.OclLaunchOutOfResources, sc.CudaLaunchOutOfResources, sc.MemObjectAllocationFailure, sc.InvalidWorkGroupSize, sc.OutOfHostMemory, sc.InvalidValue)
 
@@ -23,168 +24,200 @@ def pow2range(a, b):
     return [2**x for x in range(a, b)]
 
 
-def do_tuning(device, operation, json_path): 
-    #Context
-    context = sc.driver.context(device)
-    
-    #List of size tuples to use
-    sizes = {}
-    sizes[sc.templates.axpy] = [(x,) for x in tools.expspace(1e3, 1e8, 4)]
-    sizes[sc.templates.gemv_n] = product(pow2range(4,17), pow2range(4,17))
-    sizes[sc.templates.gemv_t] = sizes[sc.templates.gemv_n]
-    sizes[sc.templates.gemm_nn]     = product(pow2range(5, 12), pow2range(5, 12), pow2range(5, 17))
-    sizes[sc.templates.gemm_tn]     = sizes[sc.templates.gemm_nn]
-    sizes[sc.templates.gemm_nt]     = sizes[sc.templates.gemm_nn]
-    sizes[sc.templates.gemm_tt]     = sizes[sc.templates.gemm_nn]
-    
+class Tuner:
 
-    #Quick tuning - AlexNet sizes + Intuition
-    quick_tuning = False
-    if quick_tuning:
-        sizes[sc.templates.ger] 		 = [(1536,1536)]
-
-        sizes[sc.templates.gemv_n]		 = [(1000,256),
-                                            (4096,256)]
-        sizes[sc.templates.gemv_t]		 = [(169,256),
-                                            (169,384),
-                                            (729,256),
-                                            (3025,96)]
+    def __init__(self, logger, device, operation, json_path):
+        self.logger = logger
+        self.device = device
+        self.operation = operation
+        self.json_path = json_path
+      
+    def pprint_datapoint(self, x, y):
+        if self.logger:
+            self.logger.info(', '.join(map(str, x)) + ': ' + str(int(max(y))) + ' ' + tools.metric_name_of(self.operation))
+            
+    def run(self, levels = {'BLAS1': 'intermediate', 'BLAS2':'intermediate', 'BLAS3':'intermediate'}): 
+        for key, level in levels.iteritems():
+            assert key in ['BLAS1', 'BLAS2', 'BLAS3']
+            assert level in ['simple', 'intermediate', 'full']
         
-        sizes[sc.templates.gemm_nn]	 = [(3025,96,363),
-                                            (729,128,1200),
-                                            (169,384,2304),
-                                            (169,192,1728),
-                                            (169,128,1728)]
-                                            
-        sizes[sc.templates.gemm_nt]	 = [(1536,1536,1536),
-                                            (169,1728,128),
-                                            (169,1728,192),
-                                            (169,2304,384),
-                                            (729,1200,128)]
-        sizes[sc.templates.gemm_tn]	 = [(500,50,64),(25,20,576),(1728,128,169), 
-                                            (1728,192,169),
-                                            (2304,384,169),
-                                            (1200,128,729),
-                                            (363,96,3025)]
-    
-    #Remove duplicated
-    sizes = unique(list(sizes[operation]))
-    sizes = [x for x in sizes if 1e-4 <= tools.memory_footprint(operation, x) <= 1e-1]
-	
-
-    #Training data
-    performance = tools.metric_of(operation)
-    profiles, X, Y = [], [], []
-    
-	#Restore previous run
-    import csv
-    savepath = os.path.join('save', operation.__name__)
-    if not os.path.exists(savepath):
-        os.makedirs(savepath)
-	
-    try:
-		with open(os.path.join(savepath, 'X.csv')) as f:
-			X = [tuple(map(int, row)) for row in csv.reader(f, delimiter=',')]
-			
-		with open(os.path.join(savepath, 'Y.csv')) as f:
-			Y = [map(float, row) for row in csv.reader(f, delimiter=',')]
-		
-		with open(os.path.join(savepath, 'profiles.csv')) as f:
-			def mmap(x):
-				if x=='FETCH_FROM_LOCAL':
-					return sc.templates.fetching_policy_type.FETCH_FROM_LOCAL
-				if x=='FETCH_FROM_GLOBAL_CONTIGUOUS':
-					return sc.templates.fetching_policy_type.FETCH_FROM_GLOBAL_CONTIGUOUS
-				if x=='FETCH_FROM_GLOBAL_STRIDED':
-					return sc.templates.fetching_policy_type.FETCH_FROM_GLOBAL_STRIDED
-				return int(x)
-			profiles = [map(mmap,row) for v in row for row in csv.reader(f, delimiter=',')]
-    except:
-		pass
-	
-    for idx, x in enumerate(sizes):
-        if x in X:
-			continue
-        print x
-        idx = len(X)
-        nparams = len(profiles)
-        tree, operands = tools.tree_of(operation, x, context)
-        #Check if the current best prediction is not a local optimum
-        if idx==0:
-            tune = True
-            predicted = None
+        device = self.device
+        operation = self.operation
+        context = sc.driver.context(device)
+        
+        if self.logger:
+            self.logger.info('Now tuning ' + operation.__name__.replace('_','-').upper() + '...')
+         
+        sizes = {}
+        #BLAS1 training sizes
+        if levels['BLAS1']=='simple':
+            blas1_sizes = [(1e7,)]
+        elif levels['BLAS1']=='intermediate':
+            blas1_sizes = [(x,) for x in tools.expspace(1e3, 1e8, 10)]
         else:
-            if nparams==1:
-                predicted = profiles[0]
-            else:
-                clf = RandomForestRegressor(min(10, idx+1), max_depth=min(10, idx+1)).fit(X, Y)
-                #clf, nrmse = model.train(X, Y, profiles)
-                predperf = clf.predict(x)[0]
-                best = (-predperf).argsort()[:5]
-                perf = []
-                for b in best:
-                    try:
-                        perf += [performance(x, tools.benchmark(operation, profiles[b], tree))]
-                    except to_catch:
-                        pass
-                predicted = profiles[best[argmax(perf)]]
-            tune = not optimize.is_local_optimum(predicted, operation, x, context)     
-            #tune = True
-        #Retune if necessary
-        if tune:
-            #new = optimize.exhaustive(operation, x, context)
-            new = optimize.genetic(operation, x, context, niter=1000, naccept=1000, popsize=20, prior=predicted)[0]
-            if new not in profiles:
-                profiles.append(new)
-                if idx > 0:
-                    for xx,yy in zip(X, Y):
-                        _tree, _operands = tools.tree_of(operation, xx, context)
-                        try:
-                            time = tools.benchmark(operation, new, _tree)
-                            perf = performance(xx, time)
-                        except to_catch:
-                            perf = 0
-                        yy.append(0 if isinf(perf) else perf)
-        #Update dataset
-        y = []
-        fastest = max(predperf) if nparams > 1 else None
-        for ip, p in enumerate(profiles):
-            try:
-                perf = 0 if fastest and ip < nparams and predperf[ip]/fastest < .1 else performance(x,tools.benchmark(operation, p, tree))
-            except to_catch:
-                perf = 0
-            y.append(0 if isinf(perf) else perf)
-        X.append(x)
-        Y.append(y)
+            blas1_sizes = [(x,) for x in tools.expspace(1e3, 1e8, 30)] 
+        sizes[sc.templates.axpy] = blas1_sizes
+        sizes[sc.templates.dot] = blas1_sizes
+
+        #BLAS2 training sizes
+        if levels['BLAS2']=='simple':
+            blas2_sizes = [(1536, 1536)]
+        elif levels['BLAS2']=='intermediate':
+            blas2_sizes =  [(1000,256), 
+                            (4096,256), 
+                            (256, 1000), 
+                            (256, 4096),
+                            (169,256),
+                            (169, 384), 
+                            (729,256), 
+                            (3025,96)]
+        else:
+            blas2_sizes = product(pow2range(4,17), pow2range(4,17))
+        sizes[sc.templates.ger] = blas2_sizes
+        sizes[sc.templates.gemv_n] = blas2_sizes
+        sizes[sc.templates.gemv_t] = blas2_sizes
+
+        #BLAS3 training sizes
+        if levels['BLAS3']=='simple':
+            blas3_sizes = [(1536,1536,1536)]
+        elif levels['BLAS3']=='intermediate':
+            blas3_sizes = [(32, 32, 16000),
+                           (3025,96,363),
+                           (729,128,1200),
+                           (169,384,2304),
+                           (169,192,1728),
+                           (169,128,1728),
+                           (169,1728,128),
+                           (169,1728,192),
+                           (169,2304,384),
+                           (729,1200,128),
+                           (1728,128,169), 
+                           (1728,192,169),
+                           (2304,384,169),
+                           (1200,128,729),
+                           (363,96,3025)]
+        elif levels['BLAS3']=='full':
+            blas3_sizes = product(pow2range(5, 12), pow2range(5, 12), pow2range(5, 15))
+        sizes[sc.templates.gemm_nn]     = blas3_sizes
+        sizes[sc.templates.gemm_tn]     = blas3_sizes
+        sizes[sc.templates.gemm_nt]     = blas3_sizes
+        sizes[sc.templates.gemm_tt]     = blas3_sizes
+            
+        #Remove duplicates
+        sizes = unique(list(sizes[operation]))
+        sizes = [x for x in sizes if 1e-4 <= tools.memory_footprint(operation, x) <= 1e-1]
         
-        for (fname, data) in zip(['X.csv', 'Y.csv', 'profiles.csv'], [X, Y, profiles]):
-            with open(os.path.join(savepath, fname), 'wb') as f:
-                csv.writer(f).writerows(data)
-
-    
-    unused = where(bincount(argmax(Y, 1))==0)[0]
-    profiles = [x for ix,x in enumerate(profiles) if ix not in unused]
-    Y = delete(Y, unused, axis=1)
-
-    #Export to JSON
-    json_path = tools.sanitize(device.name) + '.json' if not json_path else json_path
-    if os.path.isfile(json_path):
-        json_data = json.load(open(json_path, 'r'))
-    else:
-        json_data = {}
-        json_data["version"] = "1.0"
-    operation_name = operation.__name__
-    if operation_name not in json_data:
-        json_data[operation_name] = {}
-    json_data[operation_name]['float32'] = {}
-    D = json_data[operation_name]['float32']
-    if len(profiles) > 1:
-        clf, nrmse = model.train(X, Y, profiles)
-        D['predictor'] = [{'children_left': e.tree_.children_left.tolist(),
-                            'children_right': e.tree_.children_right.tolist(),
-                            'threshold': e.tree_.threshold.astype('float64').tolist(),
-                            'feature': e.tree_.feature.astype('float64').tolist(),
-                            'value': e.tree_.value[:,:,0].astype('float64').tolist()} for e in clf.estimators_]
-    D['profiles'] = [map(int, x) for x in profiles]
-    json.dump(json_data, open(json_path,'w'))
+        #Training data
+        performance = tools.metric_of(operation)
+        profiles, X, Y = [], [], []
+        
+        #Restore previous run
+        savepath = os.path.join('save', operation.__name__)
+        if not os.path.exists(savepath):
+            os.makedirs(savepath)
+        
+        try:
+            with open(os.path.join(savepath, 'X.csv')) as f:
+                X = [tuple(map(int, row)) for row in csv.reader(f, delimiter=',')]
+                
+            with open(os.path.join(savepath, 'Y.csv')) as f:
+                Y = [map(float, row) for row in csv.reader(f, delimiter=',')]
+            
+            with open(os.path.join(savepath, 'profiles.csv')) as f:
+                def mmap(x):
+                    if x=='FETCH_FROM_LOCAL':
+                        return sc.templates.fetching_policy_type.FETCH_FROM_LOCAL
+                    if x=='FETCH_FROM_GLOBAL_CONTIGUOUS':
+                        return sc.templates.fetching_policy_type.FETCH_FROM_GLOBAL_CONTIGUOUS
+                    if x=='FETCH_FROM_GLOBAL_STRIDED':
+                        return sc.templates.fetching_policy_type.FETCH_FROM_GLOBAL_STRIDED
+                    return int(x)
+                profiles = [map(mmap,row) for v in row for row in csv.reader(f, delimiter=',')]
+        except:
+            pass
+        
+        for idx, x in enumerate(sizes):
+            if x in X:
+                self.pprint_datapoint(x, Y[X.index(x)])
+                continue
+            idx = len(X)
+            nparams = len(profiles)
+            tree, operands = tools.tree_of(operation, x, context)
+            #Check if the current best prediction is not a local optimum
+            if idx==0:
+                tune = True
+                predicted = None
+            else:
+                if nparams==1:
+                    predicted = profiles[0]
+                else:
+                    clf = RandomForestRegressor(min(10, idx+1), max_depth=min(10, idx+1)).fit(X, Y)
+                    #clf, nrmse = model.train(X, Y, profiles)
+                    predperf = clf.predict(x)[0]
+                    best = (-predperf).argsort()[:5]
+                    perf = []
+                    for b in best:
+                        try:
+                            perf += [performance(x, tools.benchmark(operation, profiles[b], tree))]
+                        except (sc.OperationNotSupported, sc.LaunchOutOfResources, sc.MemObjectAllocationFailure):
+                            pass
+                    predicted = profiles[best[argmax(perf)]]
+                tune = not optimize.is_local_optimum(predicted, operation, x, context)     
+                #tune = True
+            #Retune if necessary
+            if tune:
+                #new = optimize.exhaustive(operation, x, context)
+                optimizer = optimize.GeneticOptimizer(self.logger, naccept=1000, niter=1000, cxpb=.4, mutpb=.4, popsize=20)
+                new = optimizer.run(operation, x, context, prior=predicted)[0]
+                if new not in profiles:
+                    profiles.append(new)
+                    if idx > 0:
+                        for xx,yy in zip(X, Y):
+                            _tree, _operands = tools.tree_of(operation, xx, context)
+                            try:
+                                time = tools.benchmark(operation, new, _tree)
+                                perf = performance(xx, time)
+                            except profile_execution_failure:
+                                perf = 0
+                            yy.append(0 if isinf(perf) else perf)
+            #Update dataset
+            y = []
+            fastest = max(predperf) if nparams > 1 else None
+            for ip, p in enumerate(profiles):
+                try:
+                    perf = 0 if fastest and ip < nparams and predperf[ip]/fastest < .1 else performance(x,tools.benchmark(operation, p, tree))
+                except profile_execution_failure:
+                    perf = 0
+                y.append(0 if isinf(perf) else perf)
+            X.append(x)
+            Y.append(y)
+            
+            for (fname, data) in zip(['X.csv', 'Y.csv', 'profiles.csv'], [X, Y, profiles]):
+                with open(os.path.join(savepath, fname), 'wb') as f:
+                    csv.writer(f).writerows(data)
+            
+            #Update logging
+            self.pprint_datapoint(x, y)
 
+        
+        #Export to JSON
+        json_path = tools.sanitize(device.name) + '.json' if not self.json_path else self.json_path
+        if os.path.isfile(json_path):
+            json_data = json.load(open(json_path, 'r'))
+        else:
+            json_data = {}
+            json_data["version"] = "1.0"
+        operation_name = operation.__name__
+        if operation_name not in json_data:
+            json_data[operation_name] = {}
+        json_data[operation_name]['float32'] = {}
+        D = json_data[operation_name]['float32']
+        if len(profiles) > 1:
+            clf, nrmse = model.train(X, Y, profiles)
+            D['predictor'] = [{'children_left': e.tree_.children_left.tolist(),
+                                'children_right': e.tree_.children_right.tolist(),
+                                'threshold': e.tree_.threshold.astype('float64').tolist(),
+                                'feature': e.tree_.feature.astype('float64').tolist(),
+                                'value': e.tree_.value[:,:,0].astype('float64').tolist()} for e in clf.estimators_]
+        D['profiles'] = [map(int, x) for x in profiles]
+        json.dump(json_data, open(json_path,'w'))