diff --git a/python/autotune/external/config.ini b/python/autotune/external/config.ini
index 65053f508..faffd5eb7 100644
--- a/python/autotune/external/config.ini
+++ b/python/autotune/external/config.ini
@@ -1,28 +1,23 @@
-viennacl-src-root = /home/philippe/Development/viennacl-dev/viennacl/
+#~ viennacl-src-root = /home/philippe/Development/viennacl-dev/viennacl/
 
 [vector-axpy]
-devices = 0
-precision = single, double
-size = 5000000
+precision = single
+#~ size = 5000000
 
-[reduction]
-devices = 0
-precision = single, double
-size = 5000000
-
-[matrix-axpy]
-devices = 0
-precision = single, double
-size = 2560, 2560
-
-[row-wise-reduction]
-devices = 0
-precision = single, double
-layout = N,T
-size = 2560, 2560
-
-[matrix-product]
-devices = 0
-precision = single, double
-layout = NN,NT,TN,TT
-size = 1536, 1536, 1536
+#~ [reduction]
+#~ precision = single, double
+#~ size = 5000000
+#~
+#~ [matrix-axpy]
+#~ precision = single, double
+#~ size = 2560, 2560
+#~
+#~ [row-wise-reduction]
+#~ precision = single, double
+#~ layout = N,T
+#~ size = 2560, 2560
+#~
+#~ [matrix-product]
+#~ precision = single, double
+#~ layout = NN,NT,TN,TT
+#~ size = 1536, 1536, 1536
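The one remaining live section drives a single operation at single precision; every other section, and the per-section `devices` keys, are commented out with `#~`, since device selection now comes from the command line (see autotune.py below). As a reminder of why autotune.py needs its `map_to_list` helper: ConfigObj returns a bare string for a single value but a list of strings for comma-separated ones. A minimal sketch (hypothetical session against the file above):

    from configobj import ConfigObj

    config = ConfigObj('config.ini')
    p = config['vector-axpy']
    print(p['precision'])    # 'single' (a bare string, not a list)
    # With "precision = single, double" this would be ['single', 'double'];
    # map_to_list(str, x) wraps the scalar case so both forms iterate uniformly.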
diff --git a/python/autotune/pysrc/autotune.py b/python/autotune/pysrc/autotune.py
index 31455f912..5009c3c59 100644
--- a/python/autotune/pysrc/autotune.py
+++ b/python/autotune/pysrc/autotune.py
@@ -1,18 +1,19 @@
 from __future__ import division
 
-import argparse, itertools, os, sys
+import argparse, itertools, os, sys, json
 import misc_tools, optimize
 import pyopencl as cl
 import pyviennacl as vcl
 import pyatidlas as atd
-
+import numpy as np
 from configobj import ConfigObj
 from numpy import random
 
 from dataset import generate_dataset
 from model import train_model
 
+
 DATATYPES = { 'single' : vcl.float32,
               'double' : vcl.float64 }
 
@@ -36,34 +37,34 @@ TYPES = { 'vector-axpy': {'template':atd.VectorAxpyTemplate,
                               'perf-index': lambda x: 2*x[1][0]*x[1][1]*x[1][2]/x[2]*1e-9,
                               'perf-measure': 'GFLOP/s'} }
 
-def do_tuning(config_fname, viennacl_root):
+
+def do_tuning(config_fname, viennacl_root, device):
+    json_out = {}
     config = ConfigObj(config_fname)
+
     def map_to_list(T, x):
         return list(map(T, x if isinstance(x, list) else [x]))
+
     for operation in ['vector-axpy', 'matrix-axpy', 'reduction', 'row-wise-reduction', 'matrix-product']:
+
         if operation in config:
             p = config[operation]
-            confdevices = p['devices']
-            all_devices = [d for platform in cl.get_platforms() for d in platform.get_devices()]
-            DEVICES_PRESETS = {'all': all_devices,
-                               'gpus': [d for d in all_devices if d.type==cl.device_type.GPU],
-                               'cpus': [d for d in all_devices if d.type==cl.device_type.CPU],
-                               'accelerators': [d for d in all_devices if d.type==cl.device_type.ACCELERATOR]
-                               }
-            devices = DEVICES_PRESETS[confdevices] if confdevices in DEVICES_PRESETS else [all_devices[int(i)] for i in confdevices]
             precisions = map_to_list(str, p['precision'])
             if 'all' in precisions:
                 precisions = ['single','double']
             datatypes = [DATATYPES[k] for k in precisions]
-            #Iterate through the datatypes and the devices
-            for datatype, device in itertools.product(datatypes, devices):
+
+            #Iterate through the datatypes
+            for datatype in datatypes:
+                ctx = cl.Context([device])
                 ctx = vcl.backend.Context(ctx)
-                device = ctx.current_device
+
                 #Check data-type
                 if datatype is vcl.float64 and not device.double_fp_config:
                     sys.stderr.write('Warning : The device ' + device.name + ' does not support double precision! Skipping ...')
                     continue
+
                 #Helper for execution
                 def execute(device, node, other_params, sizes, fname = os.devnull, parameters = None):
                     with vcl.Statement(node) as statement:
@@ -75,6 +76,7 @@ def do_tuning(config_fname, viennacl_root):
                         with open(fname, "w+") as archive:
                             return optimize.genetic(statement, device, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params),
                                                     lambda t: TYPES[operation]['perf-index']([datatype().itemsize, sizes, t]), TYPES[operation]['perf-measure'], archive)
+
                 #Helper for tuning
                 def tune(execution_handler, nTuning, nDataPoints, draw, additional_parameters):
                     if 'size' in p:
@@ -85,7 +87,20 @@ def do_tuning(config_fname, viennacl_root):
                     def compute_perf(x, t):
                         return TYPES[operation]['perf-index']([datatype().itemsize, x, t])
                     X, Y, profiles = generate_dataset(TYPES[operation]['template'], execution_handler, nTuning, nDataPoints, draw)
-                    train_model(X, Y, profiles, TYPES[operation]['perf-measure'])
+                    clf = train_model(X, Y, profiles, TYPES[operation]['perf-measure'])
+
+                    #Update JSON
+                    full_operation = operation + ''.join(additional_parameters)
+                    if full_operation not in json_out:
+                        json_out[full_operation] = {}
+                    json_out[full_operation][datatype.__name__] = {}
+                    D = json_out[full_operation][datatype.__name__]
+                    D['profiles'] = [ prof.astype('int').tolist() for prof in profiles]
+                    D['predictor'] = [{'children_left': e.tree_.children_left.tolist(),
+                                       'children_right': e.tree_.children_right.tolist(),
+                                       'threshold': e.tree_.threshold.astype('float32').tolist(),
+                                       'feature': e.tree_.feature.astype('float32').tolist(),
+                                       'value': e.tree_.value[:,:,0].astype('float32').tolist()} for e in clf.estimators_]
 
                 #Vector AXPY
                 if operation=='vector-axpy':
@@ -143,6 +158,10 @@ def do_tuning(config_fname, viennacl_root):
                         return execute(device, vcl.Assign(C,LHS*RHS*alpha + C*beta),(A_trans, B_trans), sizes, fname, parameters)
                     tune(execution_handler, 50, 2000, lambda : 64*np.random.randint(low=1, high=40, size=3),(layout[0], layout[1]))
 
+    dname = misc_tools.sanitize_string(device.name)
+    json_out["version"] = "1.0"
+    json.dump(json_out, open(dname + '.json','w'))
+
 
 if __name__ == "__main__":
@@ -151,14 +170,15 @@ if __name__ == "__main__":
     print_devices_parser = subparsers.add_parser('list-devices', help='list the devices available')
     tune_parser = subparsers.add_parser('tune', help='tune using a specific configuration file')
     tune_parser.add_argument("--config", default="config.ini", required=False, type=str)
+    tune_parser.add_argument("--device", default=0, required=False, type=int)
     tune_parser.add_argument("--viennacl-root", default='', required=False, type=str)
     args = parser.parse_args()
 
+    devices = [d for platform in cl.get_platforms() for d in platform.get_devices()]
     if(args.action=='list-devices'):
         print("----------------")
         print("Devices available:")
         print("----------------")
-        devices = [d for platform in cl.get_platforms() for d in platform.get_devices()]
         for (i, d) in enumerate(devices):
             print 'Device', i, '|', cl.device_type.to_string(d.type), '|', d.name, 'on', d.platform.name
         print("----------------")
@@ -166,4 +186,4 @@ if __name__ == "__main__":
         print("------")
         print("Auto-tuning")
         print("------")
-        do_tuning(args.config, args.viennacl_root)
+        do_tuning(args.config, args.viennacl_root, devices[args.device])
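The `predictor` entry serializes each estimator of the returned RandomForestRegressor in scikit-learn's flat array layout: node i is a leaf iff children_left[i] == -1, and value[i] holds one (normalized) predicted runtime per profile. A minimal sketch of how a consumer could evaluate one serialized tree; the file name and the operation/dtype keys are illustrative only:

    import json

    with open('some_device.json') as f:
        data = json.load(f)
    tree = data['vector-axpy']['float32']['predictor'][0]

    def predict_tree(tree, x):
        node = 0
        while tree['children_left'][node] != -1:   # -1 marks a leaf
            # 'feature' was stored as float32, hence the int() cast
            if x[int(tree['feature'][node])] <= tree['threshold'][node]:
                node = tree['children_left'][node]
            else:
                node = tree['children_right'][node]
        return tree['value'][node]   # one regressed runtime per profile

    # Averaging predict_tree over every entry of 'predictor' reproduces the
    # forest's prediction; the argmin of that average selects the profile.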
diff --git a/python/autotune/pysrc/dataset.py b/python/autotune/pysrc/dataset.py
index 376a97af6..c06476217 100644
--- a/python/autotune/pysrc/dataset.py
+++ b/python/autotune/pysrc/dataset.py
@@ -15,48 +15,53 @@ def resample(X, draw):
 
 def generate_dataset(TemplateType, execution_handler, nTuning, nDataPoints, draw):
-    print "Getting some good profiles..."
-    nDim = draw().size
-    X = np.empty((nTuning, nDim))
-    t = np.empty(nTuning)
-    profiles = []
-    for i in range(nTuning):
-        x = resample(X, draw)
-        y = execution_handler(x)
-        if y not in profiles:
-            profiles.append(y)
-        idx = profiles.index(y)
-        X[i,:] = x
-        t[i] = idx
-
-    print "Generating the dataset..."
-    Y = np.empty((nDataPoints, len(profiles)))
-    X = np.empty((nDataPoints, nDim))
-    t = []
-
-    for i in range(nDataPoints):
-        x = resample(X, draw)
-        for j,y in enumerate(profiles):
-            T = execution_handler(x, os.devnull, y)
-            Y[i,j] = T
-        idx = np.argmax(Y[i,:])
-        X[i,:] = x
-        t = np.argmax(Y[:i+1,], axis=1)
-        if i%10==0:
-            sys.stdout.write('%d data points generated\r'%i)
-            sys.stdout.flush()
+    # print "Getting some good profiles..."
+    # nDim = draw().size
+    # X = np.empty((nTuning, nDim))
+    # t = np.empty(nTuning)
+    # profiles = []
+    # for i in range(nTuning):
+    #     x = resample(X, draw)
+    #     y = execution_handler(x)
+    #     if y not in profiles:
+    #         profiles.append(y)
+    #     idx = profiles.index(y)
+    #     X[i,:] = x
+    #     t[i] = idx
+    #
+    # print "Generating the dataset..."
+    # Y = np.empty((nDataPoints, len(profiles)))
+    # X = np.empty((nDataPoints, nDim))
+    # t = []
+    #
+    # for i in range(nDataPoints):
+    #     x = resample(X, draw)
+    #     for j,y in enumerate(profiles):
+    #         T = execution_handler(x, os.devnull, y)
+    #         Y[i,j] = T
+    #     idx = np.argmax(Y[i,:])
+    #     X[i,:] = x
+    #     t = np.argmax(Y[:i+1,], axis=1)
+    #     if i%10==0:
+    #         sys.stdout.write('%d data points generated\r'%i)
+    #         sys.stdout.flush()
 
     template_name = TemplateType.__name__
     dir = os.path.join("data", template_name)
     if not os.path.exists(dir):
         os.makedirs(dir)
-    np.savetxt(os.path.join(dir,"profiles.csv"), profiles)
-    np.savetxt(os.path.join(dir,"X.csv"), X)
-    np.savetxt(os.path.join(dir,"Y.csv"), Y)
+    # np.savetxt(os.path.join(dir,"profiles.csv"), profiles)
+    # np.savetxt(os.path.join(dir,"X.csv"), X)
+    # np.savetxt(os.path.join(dir,"Y.csv"), Y)
 
     profiles = np.loadtxt(os.path.join(dir, "profiles.csv"))
    X = np.loadtxt(os.path.join(dir, "X.csv"),ndmin=2)
     Y = np.loadtxt(os.path.join(dir, "Y.csv"),ndmin=2)
 
+    #idx = np.argsort(np.bincount(np.argmin(Y, axis=1)))
+    idx = np.argsort(Y[np.argmax(X),:])
+    Y = Y[:, idx]
+    profiles = profiles[idx]
+
     return X, Y, profiles
diff --git a/python/autotune/pysrc/misc_tools.py b/python/autotune/pysrc/misc_tools.py
index bed1fd7d9..001b88ba7 100644
--- a/python/autotune/pysrc/misc_tools.py
+++ b/python/autotune/pysrc/misc_tools.py
@@ -207,13 +207,13 @@ def benchmark(template, statement, device):
     return current_time/N
 
+def sanitize_string(string, keep_chars = ['_']):
+    string = string.replace(' ', '_').replace('-', '_').lower()
+    string = "".join(c for c in string if c.isalnum() or c in keep_chars).rstrip()
+    return string
+
 def update_viennacl_headers(viennacl_root, device, datatype, operation, additional_parameters, parameters):
-    def sanitize_string(string, keep_chars = ['_']):
-        string = string.replace(' ', '_').replace('-', '_').lower()
-        string = "".join(c for c in string if c.isalnum() or c in keep_chars).rstrip()
-        return string
-
     def append_include(data, path):
         include_name = '#include "' + path +'"\n'
         already_included = data.find(include_name)
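sanitize_string is promoted to module scope so that autotune.py can reuse it to derive the output file name (`<sanitized device name>.json`). For illustration, with a hypothetical device name:

    import misc_tools

    print(misc_tools.sanitize_string('Tesla C2050'))   # -> 'tesla_c2050'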
diff --git a/python/autotune/pysrc/model.py b/python/autotune/pysrc/model.py
index f43d3a9f4..eaad42605 100644
--- a/python/autotune/pysrc/model.py
+++ b/python/autotune/pysrc/model.py
@@ -1,7 +1,6 @@
 from sklearn import tree
 from sklearn import ensemble
-from numpy import array, bincount, mean, std, max, argmax, min, argmin, median
-
+import numpy as np
 
 def gmean(a, axis=0, dtype=None):
     if not isinstance(a, np.ndarray): # if not an ndarray object attempt to convert it
@@ -16,25 +15,30 @@ def gmean(a, axis=0, dtype=None):
     return np.exp(log_a.mean(axis=axis))
 
 def train_model(X, Y, profiles, metric):
-    print("Building the model...")
-
-    Xmean = mean(X)
-    Xstd = std(X)
-    X = (X - Xmean)/Xstd
-
-    Y = Y[:, :]
-    Ymax = max(Y)
+    Y=Y[:,:]
+    profiles=profiles[:]
+    Ymax = np.max(Y)
     Y = Y/Ymax
-    ref = argmax(bincount(argmin(Y, axis=1))) #most common profile
-    cut = int(0.800*X.shape[0]+1)
-    #Train the model
-    clf = ensemble.RandomForestRegressor(10, max_depth=10).fit(X[:cut,:], Y[:cut,:])
+    cut = int(0.75*X.shape[0])
+    clf = ensemble.RandomForestRegressor(10, max_depth=4).fit(X[:cut,:], Y[:cut,:])
 
-    t = argmin(clf.predict(X[cut:,:]), axis = 1)
-    s = array([y[ref]/y[k] for y,k in zip(Y[cut:,:], t)])
-    tt = argmin(Y[cut:,:], axis = 1)
-    ss = array([y[ref]/y[k] for y,k in zip(Y[cut:,:], tt)])
-    print("Testing speedup : mean = %.3f, median = %.3f, min = %.3f, max %.3f"%(gmean(s), median(s), min(s), max(s)))
-    print("Optimal speedup : mean = %.3f, median = %.3f, min = %.3f, max %.3f"%(gmean(ss), median(ss), min(ss), max(ss)))
+    t = np.argmin(clf.predict(X[cut:,:]), axis = 1)
+    s = np.array([y[0]/y[k] for y,k in zip(Y[cut:,:], t)])
+    tt = np.argmin(Y[cut:,:], axis = 1)
+    ss = np.array([y[0]/y[k] for y,k in zip(Y[cut:,:], tt)])
+
+    p5 = lambda a: np.percentile(a, 5)
+    p25 = lambda a: np.percentile(a, 25)
+    p50 = lambda a: np.percentile(a, 50)
+    p75 = lambda a: np.percentile(a, 75)
+    p95 = lambda a: np.percentile(a, 95)
+
+    print("Percentile :\t 5 \t 25 \t 50 \t 75 \t 95")
+    print("Testing speedup:\t %.2f\t %.2f\t %.2f\t %.2f\t %.2f"%(p5(s), p25(s), p50(s), p75(s), p95(s)))
+    print("Optimal speedup:\t %.2f\t %.2f\t %.2f\t %.2f\t %.2f"%(p5(ss), p25(ss), p50(ss), p75(ss), p95(ss)))
+
+    return clf
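Note that speedups are now computed against column 0 instead of the most common profile: after the argsort added in dataset.py, column 0 is the profile that ran fastest on the largest input (for the single-column vector-axpy feature matrix), and train_model now returns the fitted forest so autotune.py can serialize it. A synthetic sketch of the reported metric, with illustrative shapes and random stand-ins for real timings:

    import numpy as np

    Y = np.random.rand(200, 5)                # runtimes: 200 sizes x 5 profiles
    t = np.random.randint(0, 5, size=200)     # profiles a predictor might pick
    s = np.array([y[0]/y[k] for y, k in zip(Y, t)])   # speedup over profile 0
    print("median speedup: %.2f" % np.percentile(s, 50))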