Runtime: More progress towards cuBLAS integration

2016-10-04 01:02:43 -04:00
parent fb9669a34d
commit ffb9548b6a
18 changed files with 170 additions and 210 deletions
--- a/include/isaac/jit/generation/base.h
+++ b/include/isaac/jit/generation/base.h
@@ -84,6 +84,7 @@ public:
  virtual std::vector<int_t> input_sizes(expression_tree const & expressions) const = 0;
  virtual int is_invalid(expression_tree const & expressions, driver::Device const & device) const = 0;
  virtual void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const & expressions) = 0;
+  virtual expression_type type() const = 0;
  std::string generate(std::string const & suffix, expression_tree const & expressions, driver::Device const & device);
  std::shared_ptr<base> getptr();
 };
--- a/include/isaac/jit/generation/elementwise_1d.h
+++ b/include/isaac/jit/generation/elementwise_1d.h
@@ -38,6 +38,7 @@ public:
  elementwise_1d(unsigned int vwidth, unsigned int ls, unsigned int ng, fetch_type fetch);
  std::vector<int_t> input_sizes(expression_tree const  & expressions) const;
  void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &);
+  expression_type type() const;
 private:
  unsigned int ng_;
  fetch_type fetch_;
--- a/include/isaac/jit/generation/elementwise_2d.h
+++ b/include/isaac/jit/generation/elementwise_2d.h
@@ -39,6 +39,7 @@ public:
  elementwise_2d(unsigned int vwidth, unsigned int ls0, unsigned int ls1,  unsigned int ng0, unsigned int ng1, fetch_type fetch);
  std::vector<int_t> input_sizes(expression_tree const  & expressions) const;
  void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &);
+  expression_type type() const;
 private:
  unsigned int ng0_;
  unsigned int ng1_;
--- a/include/isaac/jit/generation/gemm.h
+++ b/include/isaac/jit/generation/gemm.h
@@ -39,6 +39,7 @@ public:
  int is_invalid(expression_tree const  &, driver::Device const &) const;
  std::vector<int_t> input_sizes(expression_tree const & expressions) const;
  void enqueue(driver::CommandQueue & queue, driver::Program const &, std::string const &, runtime::execution_handler const & h);
+  expression_type type() const;
 private:
  const char A_trans_;
  const char B_trans_;
@@ -62,6 +63,7 @@ public:
       , int_t lf0, int_t lf1, char A_trans, char B_trans);
  std::vector<int_t> input_sizes(expression_tree const & expressions) const;
  void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const & h);
+  expression_type type() const;

 private:
  //Parameters
--- a/include/isaac/jit/generation/reduce_1d.h
+++ b/include/isaac/jit/generation/reduce_1d.h
@@ -43,6 +43,8 @@ public:
  reduce_1d(unsigned int vwidth, unsigned int ls, unsigned int ng, fetch_type fetch);
  std::vector<int_t> input_sizes(expression_tree const  & expressions) const;
  void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &);
+  expression_type type() const;
+
 private:
  unsigned int ng_;
  fetch_type fetch_;
--- a/include/isaac/jit/generation/reduce_2d.h
+++ b/include/isaac/jit/generation/reduce_2d.h
@@ -44,6 +44,7 @@ private:
 public:
  virtual std::vector<int_t> input_sizes(expression_tree const & expressions) const;
  void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &);
+  expression_type type() const;
 private:
  unsigned int ng0_;
  unsigned int ng1_;
--- a/include/isaac/runtime/profiles.h
+++ b/include/isaac/runtime/profiles.h
@@ -53,7 +53,7 @@ public:

    public:
      value_type(expression_type, numeric_type, predictors::random_forest const &, std::vector< std::shared_ptr<templates::base> > const &, driver::CommandQueue const &);
-      value_type(expression_type, numeric_type, std::shared_ptr<templates::base> const &, driver::CommandQueue const &);
+      value_type(numeric_type, std::shared_ptr<templates::base> const &, driver::CommandQueue const &);
      void execute(runtime::execution_handler const &);
      templates_container const & templates() const;

--- a/lib/jit/generation/elementwise_1d.cpp
+++ b/lib/jit/generation/elementwise_1d.cpp
@@ -43,6 +43,9 @@ int elementwise_1d::is_invalid_impl(driver::Device const &, expression_tree cons
  return TEMPLATE_VALID;
 }

+expression_type elementwise_1d::type() const
+{ return ELEMENTWISE_1D; }
+
 std::string elementwise_1d::generate_impl(std::string const & suffix, expression_tree const & tree, driver::Device const & device, symbolic::symbols_table const & symbols) const
 {
  driver::backend_type backend = device.backend();
--- a/lib/jit/generation/elementwise_2d.cpp
+++ b/lib/jit/generation/elementwise_2d.cpp
@@ -42,6 +42,9 @@ int elementwise_2d::is_invalid_impl(driver::Device const &, expression_tree cons
  return TEMPLATE_VALID;
 }

+expression_type elementwise_2d::type() const
+{ return ELEMENTWISE_2D; }
+
 std::string elementwise_2d::generate_impl(std::string const & suffix, expression_tree const  & tree, driver::Device const & device, symbolic::symbols_table const & symbols) const
 {
  std::string init0, upper_bound0, inc0, init1, upper_bound1, inc1;
--- a/lib/jit/generation/gemm.cpp
+++ b/lib/jit/generation/gemm.cpp
@@ -50,16 +50,11 @@ std::vector<int_t> infos(expression_tree const & tree, symbolic::preset::gemm::a
 }

 /* ------------------ CUBLAS ------------------ */
-bool cublas_gemm::init()
-{
-  return driver::dispatch::cublasinit();
-}
-
 cublas_gemm::cublas_gemm(char A_trans, char B_trans): A_trans_(A_trans), B_trans_(B_trans), init_(driver::dispatch::cublasinit())
 { }

 int cublas_gemm::is_invalid(expression_tree const  &, driver::Device const & device) const
-{ return init_ && device.backend()==driver::CUDA; }
+{ return (init_ && device.backend()==driver::CUDA)?0:-1; }

 std::vector<int_t> cublas_gemm::input_sizes(expression_tree const & expressions) const
 {
@@ -67,9 +62,21 @@ std::vector<int_t> cublas_gemm::input_sizes(expression_tree const & expressions)
  return infos((expression_tree&)expressions, dummy, A_trans_);
 }

+expression_type cublas_gemm::type() const
+{
+  if(A_trans_=='N' && B_trans_=='N')
+    return GEMM_NN;
+  else if(A_trans_=='T' && B_trans_=='N')
+    return GEMM_TN;
+  else if(A_trans_=='N' && B_trans_=='T')
+    return GEMM_NT;
+  else
+    return GEMM_TT;
+}
+
 void cublas_gemm::enqueue(driver::CommandQueue & queue, driver::Program const &, std::string const &, runtime::execution_handler const & control)
 {
-  namespace drv = driver;;
+  namespace drv = driver;
  //Get GEMM info
  symbolic::preset::gemm::args args;
  std::vector<int_t> MNK = infos(control.x(), args, A_trans_);
@@ -115,6 +122,19 @@ unsigned int gemm::lmem_usage(expression_tree const & expression) const
  return N*size_of(expression.dtype());
 }

+expression_type gemm::type() const
+{
+  if(A_trans_=='N' && B_trans_=='N')
+    return GEMM_NN;
+  else if(A_trans_=='T' && B_trans_=='N')
+    return GEMM_TN;
+  else if(A_trans_=='N' && B_trans_=='T')
+    return GEMM_NT;
+  else
+    return GEMM_TT;
+}
+
+
 unsigned int gemm::registers_usage(expression_tree const & expression) const
 {
  unsigned int N = mS_ * nS_ + mS_ * kS_ + kS_ * nS_;
--- a/lib/jit/generation/reduce_1d.cpp
+++ b/lib/jit/generation/reduce_1d.cpp
@@ -55,6 +55,9 @@ unsigned int reduce_1d::temporary_workspace(expression_tree const &) const
    return 0;
 }

+expression_type reduce_1d::type() const
+{ return REDUCE_1D; }
+
 inline void reduce_1d::reduce_1d_local_memory(kernel_generation_stream & stream, unsigned int size, std::vector<symbolic::reduce_1d*> exprs,
                                   std::string const & buf_str, std::string const & buf_value_str, driver::backend_type) const
 {
--- a/lib/jit/generation/reduce_2d.cpp
+++ b/lib/jit/generation/reduce_2d.cpp
@@ -290,6 +290,14 @@ std::vector<int_t> reduce_2d::input_sizes(expression_tree const & tree) const
  return {shape[0], shape[1]};
 }

+expression_type reduce_2d::type() const
+{
+  if(reduction_type_==REDUCE_ROWS)
+    return REDUCE_2D_ROWS;
+  else
+    return REDUCE_2D_COLS;
+}
+
 void reduce_2d::enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const & control)
 {
  expression_tree const & tree = control.x();
--- a/lib/runtime/profiles.cpp
+++ b/lib/runtime/profiles.cpp
@@ -77,7 +77,7 @@ profiles::value_type::value_type(expression_type etype, numeric_type dtype, pred
 }


-profiles::value_type::value_type(expression_type etype, numeric_type dtype, std::shared_ptr<templates::base> const & tp, driver::CommandQueue const & queue) : templates_(1,tp), queue_(queue), cache_(driver::backend::programs::get(queue,etype,dtype))
+profiles::value_type::value_type(numeric_type dtype, std::shared_ptr<templates::base> const & tp, driver::CommandQueue const & queue) : templates_(1,tp), queue_(queue), cache_(driver::backend::programs::get(queue,tp->type(),dtype))
 {
  cache_.clear();
 }
@@ -197,7 +197,7 @@ void profiles::import(std::string const & str, driver::CommandQueue const & queu
            result[{etype, dtype}] = std::make_shared<value_type>(etype, dtype, predictor, templates, queue);
          }
          else
-            result[{etype, dtype}] = std::make_shared<value_type>(etype, dtype, templates[0], queue);
+            result[{etype, dtype}] = std::make_shared<value_type>(dtype, templates[0], queue);
        }
      }
    }
--- a/python/src/bind/common.hpp
+++ b/python/src/bind/common.hpp
@@ -86,31 +86,5 @@ namespace tools
            throw;
        }
    }
-
-    inline sc::expression_type extract_template_type(bp::object const & odtype)
-    {
-        std::string name = bp::extract<std::string>(odtype.attr("__class__").attr("__name__"))();
-        if(name=="class")
-          name = bp::extract<std::string>(odtype.attr("__name__"))();
-        else
-          name = bp::extract<std::string>(odtype.attr("__class__").attr("__name__"))();
-
-        if(name=="elementwise_1d") return sc::ELEMENTWISE_1D;
-        else if(name=="elementwise_2d") return sc::ELEMENTWISE_2D;
-        else if(name=="reduce_1d") return sc::REDUCE_1D;
-        else if(name=="reduce_2d_rows") return sc::REDUCE_2D_ROWS;
-        else if(name=="reduce_2d_cols") return sc::REDUCE_2D_COLS;
-        else if(name=="gemm_nn") return sc::GEMM_NN;
-        else if(name=="gemm_tn") return sc::GEMM_TN;
-        else if(name=="gemm_nt") return sc::GEMM_NT;
-        else if(name=="gemm_tt") return sc::GEMM_TT;
-        else
-        {
-            PyErr_SetString(PyExc_TypeError, "Template type not understood");
-            bp::throw_error_already_set();
-            throw;
-        }
-    }
-
 }
 #endif
--- a/python/src/bind/core.cpp
+++ b/python/src/bind/core.cpp
@@ -106,7 +106,7 @@ namespace detail
  std::shared_ptr<rt::profiles::value_type> construct_model(bp::object const & tp, bp::object dtype, sc::driver::CommandQueue & queue)
  {
      tpt::base* raw =  bp::extract<tpt::base*>(tp);
-      return std::make_shared<rt::profiles::value_type>(tools::extract_template_type(tp), tools::extract_dtype(dtype), raw->getptr(), queue);
+      return std::make_shared<rt::profiles::value_type>(tools::extract_dtype(dtype), raw->getptr(), queue);
  }

  std::shared_ptr<sc::array>
@@ -219,9 +219,9 @@ namespace detail
  {
      static rt::profiles::value_type& get_item(rt::profiles::map_type& container, bp::tuple i_)
      {
-          sc::expression_type expression = tools::extract_template_type(i_[0]);
+          tpt::base* tpt =  bp::extract<tpt::base*>(i_[0]);
          sc::numeric_type dtype = tools::extract_dtype(i_[1]);
-          rt::profiles::map_type::iterator i = container.find(std::make_pair(expression, dtype));
+          rt::profiles::map_type::iterator i = container.find(std::make_pair(tpt->type(), dtype));
          if (i == container.end())
          {
              PyErr_SetString(PyExc_KeyError, "Invalid key");
@@ -232,9 +232,9 @@ namespace detail

      static void set_item(rt::profiles::map_type& container, bp::tuple i_, rt::profiles::value_type const & v)
      {
-          sc::expression_type expression = tools::extract_template_type(i_[0]);
+          tpt::base* tpt =  bp::extract<tpt::base*>(i_[0]);
          sc::numeric_type dtype = tools::extract_dtype(i_[1]);
-          container[std::make_pair(expression, dtype)].reset(new rt::profiles::value_type(v));
+          container[std::make_pair(tpt->type(), dtype)].reset(new rt::profiles::value_type(v));
      }
  };
 }
--- a/tune/android/tune/optimize.py
+++ b/tune/android/tune/optimize.py
@@ -39,28 +39,6 @@ fetch_types = [sc.templates.fetch_type.FETCH_FROM_GLOBAL_CONTIGUOUS,
               sc.templates.fetch_type.FETCH_FROM_LOCAL,
               sc.templates.fetch_type.FETCH_FROM_LOCAL]

-def exhaustive(template, sizes, context):
-    tree, _ = tools.tree_of(template, sizes, context)
-    metric = tools.metric_of(template)
-    nbits = tools.genetic_infos_of(template)['nbits']
-    categorical = tools.genetic_infos_of(template)['categorical']
-    ranges = [range(2**x) for x in nbits]
-    ranges = list(product(*ranges))
-    timings = {}
-    best = None
-    for idx, r in enumerate(ranges):
-        parameters = tuple([fetch_types[x] if i in categorical else 2**x for i,x in enumerate(r)])
-        try:
-            time = tools.benchmark(template, parameters, tree)
-            if not best or time < best[1]:
-                best = parameters, time
-        except profile_execution_failure:
-            pass
-        if best:
-            stdout.write('%.2f %% | Best %.2f [ for %s ]\r'%(float(idx*100)/len(ranges),metric(sizes, best[1]), best[0]))
-    return best[0]
-        
-
 class GeneticOptimizer:
    
    def __init__(self, logger, naccept=500, niter=1000, cxpb=.4, mutpb=.4, popsize=10, progress_bar = None):
@@ -105,7 +83,10 @@ class GeneticOptimizer:
        def evaluate(genome):
            idx = tuple(genome)
            if idx not in cache:
-                cache[idx] = tools.benchmark(template, decode(genome), tree)
+                time = tools.benchmark(template, template(*decode(genome)), tree)
+                if time == float('inf'):
+                    return time, 
+                cache[idx] = time
            self.progress_bar.update(max(len(cache), it), self.niter, decode(min(cache, key=cache.get)), metric(sizes, min(cache.values())))
            return cache[idx],
            
@@ -132,11 +113,9 @@ class GeneticOptimizer:
        genome = encode(prior if prior else list(initializer.next()))
        while len(population) < self.popsize:
            individual = creator.Individual(genome)
-            try:
-                individual.fitness.values = toolbox.evaluate(genome)
+            individual.fitness.values = toolbox.evaluate(genome)
+            if max(individual.fitness.values) != float('inf'):
                population += [individual]
-            except profile_execution_failure:
-                pass
            genome = encode(list(initializer.next()))
        hof.update(population)
        
@@ -146,26 +125,25 @@ class GeneticOptimizer:
            #Generate offspring
            offspring = []
            while len(offspring) < self.popsize:
-                try:
-                    op_choice = random.random()
-                    #Cross-over
-                    if op_choice < self.cxpb: 
-                        ind1, ind2 = map(toolbox.clone, random.sample(population, 2))
-                        ind1, ind2 = toolbox.mate(ind1, ind2)
-                        ind = ind1
-                        toolbox.evaluate(ind)
+                op_choice = random.random()
+                #Cross-over
+                if op_choice < self.cxpb: 
+                    ind1, ind2 = map(toolbox.clone, random.sample(population, 2))
+                    ind1, ind2 = toolbox.mate(ind1, ind2)
+                    ind = ind1
+                    ind.fitness.values = toolbox.evaluate(ind) #store the result: the check below reads it
+                    if max(ind.fitness.values) != float('inf'):
                        offspring += [ind]
-                    #Mutation
-                    elif op_choice < self.cxpb + self.mutpb: 
-                        ind = toolbox.clone(random.choice(population))
-                        ind, = toolbox.mutate(ind, 1.0/offsets[-1])
-                        toolbox.evaluate(ind)
+                #Mutation
+                elif op_choice < self.cxpb + self.mutpb: 
+                    ind = toolbox.clone(random.choice(population))
+                    ind, = toolbox.mutate(ind, 1.0/offsets[-1])
+                    ind.fitness.values = toolbox.evaluate(ind) #store the result: the check below reads it
+                    if max(ind.fitness.values) != float('inf'):
                        offspring += [ind]
-                    #Reproduction
-                    else: 
-                        offspring += [random.choice(population)]
-                except profile_execution_failure:
-                    pass
+                #Reproduction
+                else: 
+                    offspring += [random.choice(population)]

            #Update fitnesses
            fitnesses = toolbox.map(toolbox.evaluate, offspring)
@@ -195,9 +173,8 @@ def is_local_optimum(parameters, template, sizes, context):
        sweep_over = [0,1,2,3,4]
    
    #Evaluate the provided parameters guess
-    try:
-        reference = tools.benchmark(template, parameters, tree)
-    except profile_execution_failure:
+    reference = tools.benchmark(template, template(*parameters), tree)
+    if isinf(reference):
        return False

    #Latency bound -- ignore
@@ -210,12 +187,9 @@ def is_local_optimum(parameters, template, sizes, context):
    for x in product(*domain):
        if x==parameters:
            pass
-        try:
-            time = tools.benchmark(template, x, tree)
-            if time/reference < .98:
-                return False
-        except profile_execution_failure:
-            pass
+        time = tools.benchmark(template, template(*x), tree)
+        if time/reference < .98:
+            return False
    return True
    
    
--- a/tune/android/tune/tools.py
+++ b/tune/android/tune/tools.py
@@ -40,15 +40,18 @@ def linspace(a, b, n=100):
 def expspace(a,b,N,r=128):
    return [int(ceil(exp(x)/r)*r) for x in linspace(log(a), log(b), N)]
                  
-def benchmark(template, setting, tree):
+def benchmark(operation, template, tree):
    queue = tree.context.queues[0]
-    queue.profiles[template, sc.float32] = sc.profile(template(*setting), sc.float32, queue)
+    queue.profiles[template, sc.float32] = sc.profile(template, sc.float32, queue)
    times = []
    total = 0
    i = 0
    #Warm-up
-    z, events = sc.driver.enqueue(tree)
-    tree.context.queues[0].synchronize()
+    try:
+        z, events = sc.driver.enqueue(tree)
+        tree.context.queues[0].synchronize()
+    except profile_execution_failure:
+        return float("inf")
    #Time
    while total < 1e-1:
        start = time()
@@ -119,6 +122,16 @@ def metric_name_of(template):
        return 'GFLOPS'
    return 'GB/S'

+def external_profiles(template):
+    if template is sc.templates.gemm_nn:
+        return [sc.templates.cublas_gemm('N', 'N')]
+    elif template is sc.templates.gemm_tn:
+        return [sc.templates.cublas_gemm('T', 'N')]
+    elif template is sc.templates.gemm_nt:
+        return [sc.templates.cublas_gemm('N', 'T')]
+    elif template is sc.templates.gemm_tt:
+        return [sc.templates.cublas_gemm('T', 'T')]
+        
 def genetic_infos_of(template):
    if issubclass(template, sc.templates.elementwise_1d):
        return {'categorical': [3], 'nbits': [3,4,4,2] }
--- a/tune/android/tune/tune.py
+++ b/tune/android/tune/tune.py
@@ -69,73 +69,52 @@ class Tuner:

        #BLAS1 training sizes
        if operation in [sc.templates.elementwise_1d, sc.templates.reduce_1d]:
-            if level=='simple':
-                sizes = [(10000000,)]
-            elif level=='intermediate':
-                sizes = [(x,) for x in tools.expspace(1e3, 1e8, 10)]
-            else:
-                sizes = [(x,) for x in tools.expspace(1e3, 1e8, 100)] 
+            sizes = [(x,) for x in tools.expspace(1e3, 1e8, 20)]
        
        #BLAS2 training sizes
        if operation in [sc.templates.elementwise_2d, sc.templates.reduce_2d_rows, sc.templates.reduce_2d_cols]:
-            if level=='simple':
-                sizes = [(1536, 1536)]
-            elif level=='intermediate':
-				sizes = []
-				#Square
-				for N in [896, 1760, 2048, 2560]:
-				   sizes += [(N, N)]
-				#Tall and Skinny
-				for M in [16, 32, 64, 128]:
-					for N in [1024, 4096, 16384, 65536, 262144]:
-						sizes += [(M, N)]
-						sizes += [(N, M)]
-            else:
-                sizes = product(pow2range(4,17), pow2range(4,17))
+            sizes = []
+            #Square
+            for N in [896, 1760, 2048, 2560]:
+                sizes += [(N, N)]
+            #Tall and Skinny
+            for M in [16, 32, 64, 128]:
+                for N in [1024, 4096, 16384, 65536, 262144]:
+                    sizes += [(M, N)]
+                    sizes += [(N, M)]
        
        #BLAS3 training sizes
        if operation in [sc.templates.gemm_nn, sc.templates.gemm_nt, sc.templates.gemm_tn, sc.templates.gemm_tt]:
-            if level=='simple':
-                sizes = [(2560,2560,2560)]
-            elif level=='intermediate':
-               sizes = []
-               #Square
-               for N in [896, 1760, 2048, 2560]:
-				   sizes += [(N, N, N)]
-               #LaPack
-               for N in [896, 1760, 2048, 2560]:
-				   for K in [16, 32, 64, 128]:
-					   sizes += [(N, N, K)]
-               #Covariance
-               for N in [16, 32, 64, 128]:
-				   for K in [16000,32000,64000,128000]:
-					   sizes += [(N, N, K)]
-               #DeepSpeech
-               for M in [1760, 2048, 2560]:
-                   for N in [16, 32, 64, 128, M]:
-                       sizes += [(M, N, M)]
-            elif level=='full':
-			    sizes = product(pow2range(5, 12), pow2range(5, 12), pow2range(5, 17))
-
-        #Remove duplicates and or too small/big tuples
-        sizes = [x for x in sizes if 1e-4 <= tools.memory_footprint(operation, x) <= 2e-1]
+            sizes = []
+            #Square
+            for N in [896, 1760, 2048, 2560]:
+                sizes += [(N, N, N)]
+            #LaPack
+            for N in [896, 1760, 2048, 2560]:
+			   for K in [16, 32, 64, 128]:
+				   sizes += [(N, N, K)]
+            #Covariance
+            for N in [16, 32, 64, 128]:
+			   for K in [16000,32000,64000,128000]:
+				   sizes += [(N, N, K)]
+            #DeepSpeech
+            for M in [1760, 2048, 2560]:
+                for N in [16, 32, 64, 128, M]:
+                    sizes += [(M, N, M)]

        #Training data
        performance = tools.metric_of(operation)
        profiles, X, Y = [], [], []
        
-        #Restore previous run
+        #Restore progress
        savepath = os.path.join('save', operation.__name__)
        if not os.path.exists(savepath):
            os.makedirs(savepath)
-        
        try:
            with open(os.path.join(savepath, 'X.csv')) as f:
                X = [tuple(map(int, row)) for row in csv.reader(f, delimiter=',')]
-                
            with open(os.path.join(savepath, 'Y.csv')) as f:
                Y = [map(float, row) for row in csv.reader(f, delimiter=',')]
-            
            with open(os.path.join(savepath, 'profiles.csv')) as f:
                def mmap(x):
                    if x=='FETCH_FROM_LOCAL':
@@ -149,94 +128,69 @@ class Tuner:
        except:
            pass
        
-        ##### Exploration #####
+        #Tuning
        for idx, x in enumerate(sizes):
+            #Create new line on log
            if idx>0:
-                self.progress_bar.set_finished()
-
+             self.progress_bar.set_finished()
            self.progress_bar.set_prefix(', '.join(map(str, x)))
-            #Skip if saved
+            #Skip if already saved
            if x in X:
                row = Y[X.index(x)]
                self.progress_bar.update(1, 1, profiles[argmax(row)], max(row))
                continue
-            
-            #Check if the current best prediction is not a local optimum
-            idx = len(X)
-            nparams = len(profiles)
            tree, operands = tools.tree_of(operation, x, context)
-            if idx==0:
-                retune = True
-                predicted = None
-            else:
-                if nparams==1:
-                    predicted = profiles[0]
-                else:
-                    clf = RandomForestRegressor(min(10, idx+1), max_depth=min(10, idx+1)).fit(X, Y)
-                    #clf, nrmse = model.train(X, Y, profiles)
-                    predperf = clf.predict(x)[0]
-                    best = (-predperf).argsort()
-                    perf = []
-                    for b in best:
-                        try:
-                            perf += [performance(x, tools.benchmark(operation, profiles[b], tree))]
-                            break
-                        except profile_execution_failure:
-                            pass
-                    if perf:
-                        predicted = profiles[best[argmax(perf)]]
-                        retune = not optimize.is_local_optimum(predicted, operation, x, context)
-                    else:
-                        retune = True
-                        predicted = None
-                
+            #Check if GA needs to run (i.e., current best prediction is not a local optimum)
+            tune = True
+            best = None
+            if idx > 0:
+                dim = min(10, idx+1)
+                model = RandomForestRegressor(dim, max_depth=dim).fit(X, Y) #2nd positional arg is not max_depth
+                predictions = model.predict(x)[0]
+                for ip in (-predictions).argsort(): #best predicted profile first; do not clobber idx
+                    ts = tools.benchmark(operation, operation(*profiles[ip]), tree)
+                    if np.isfinite(ts):
+                        break
+                if np.isfinite(ts):
+                    best = profiles[ip]
+                    tune = not optimize.is_local_optimum(best, operation, x, context)
            #Retune if necessary
-            if retune:
+            if tune:
                optimizer = optimize.GeneticOptimizer(self.logger, naccept=1000, niter=1000, cxpb=.4, mutpb=.4, popsize=20, progress_bar = self.progress_bar)
-                new = optimizer.run(operation, x, context, prior=predicted)[0]
-                if new not in profiles:
-                    profiles.append(new)
-                    if idx > 0:
-                        for xx,yy in zip(X, Y):
-                            _tree, _operands = tools.tree_of(operation, xx, context)
-                            try:
-                                time = tools.benchmark(operation, new, _tree)
-                                perf = performance(xx, time)
-                            except profile_execution_failure:
-                                perf = 0
-                            yy.append(0 if isinf(perf) else perf)
-                            
-                
-            ##### Training #####
-            y = []
-            fastest = max(predperf) if nparams > 1 else None
-            for ip, p in enumerate(profiles):
-                try:
-                    perf = 0 if fastest and ip < nparams and predperf[ip]/fastest < .1 else performance(x,tools.benchmark(operation, p, tree))
-                except profile_execution_failure:
-                    perf = 0
-                y.append(0 if isinf(perf) else perf)
+                best = optimizer.run(operation, x, context, prior=best)[0]
+                if best not in profiles:
+                    profiles.append(best)
+                    for xx,yy in zip(X, Y): #back-fill the new profile on every size already recorded
+                        _tree, _operands = tools.tree_of(operation, xx, context) #keep 'tree' bound to the current size
+                        time = tools.benchmark(operation, operation(*best), _tree)
+                        yy.append(performance(xx, time))
+            #Update dataset
            X.append(x)
+            y = [performance(x,tools.benchmark(operation, operation(*prf), tree)) for prf in profiles] #benchmark expects a template instance
            Y.append(y)
-            
            #Save data
            for (fname, data) in zip(['X.csv', 'Y.csv', 'profiles.csv'], [X, Y, profiles]):
                with open(os.path.join(savepath, fname), 'wb') as f:
                    csv.writer(f).writerows(data)
-            
            #print performance info in case no tuning was done
-            if not retune:
+            if not tune:
                row = Y[X.index(x)]
                self.progress_bar.update(1, 1, profiles[argmax(row)], max(row))
        self.progress_bar.set_finished()
        
-        #Remove unused profiles
+        #Adding external profiles
+        #~ for prf in tools.external_profiles(operation):
+            #~ x = [1024, 1024, 1024]
+            #~ tree, operands = tools.tree_of(operation, x, context)
+            #~ print performance(x,tools.benchmark(operation, prf, tree))
+            
+        #Pruning of useless profiles
        if len(Y[0]) > 1:
            unused = np.where(np.bincount(np.argmax(Y, 1))==0)[0]
            profiles = [p for ip,p in enumerate(profiles) if ip not in unused]
            Y = np.delete(Y, np.where(np.bincount(np.argmax(Y, 1))==0), axis=1).tolist()          
        
-        ##### Exportation #####
+        #Exporting to JSON
        json_path = tools.sanitize(device.name) + '.json' if not self.json_path else self.json_path
        if os.path.isfile(json_path):
            json_data = json.load(open(json_path, 'r'))