Fixed indentation

2014-09-29 03:01:33 +02:00
parent 0eb56a10f0
commit f4653d9174
9 changed files with 810 additions and 810 deletions
--- a/autotune/python/autotune.py
+++ b/autotune/python/autotune.py
@@ -49,98 +49,98 @@ TYPES = { 'vector-axpy': {'template':vcl.atidlas.VectorAxpyTemplate,
                            'perf-measure': 'GFLOP/s'} }

 def do_tuning(config_fname, spec_fname, viennacl_root):
-  config = ConfigObj(config_fname, configspec=spec_fname)
-  map_to_list = lambda T: list(map(T[0], T[1] if isinstance(T[1], list) else [T[1]])) 
-  for operation in ['vector-axpy', 'matrix-axpy', 'row-wise-reduction', 'matrix-product']:
-   if operation in config:
-     p = config[operation]        
-     confdevices = p['devices']
-     devices = utils.DEVICES_PRESETS[confdevices] if confdevices in utils.DEVICES_PRESETS else [utils.all_devices[int(i)] for i in confdevices]
-     precisions =  map_to_list((str, p['precision']))
-     datatypes = [DATATYPES[k] for k in precisions]
-     #Iterate through the datatypes and the devices
-     for datatype, device in itertools.product(datatypes, devices):
-       ctx = cl.Context([device])
-       ctx = vcl.backend.Context(ctx)
-       device = ctx.current_device
-       #Check data-type
-       if datatype is vcl.float64 and not device.double_fp_config:
-         sys.stderr.write('Warning : The device ' + device.name + ' does not support double precision! Skipping ...')
-         continue
-       #Helper
-       def execute(statement, other_params, sizes, fname = os.devnull):
-         print('-----')
-         print(' '.join(map(str, ("Now tuning:", datatype.__name__, '-', operation, '-'.join(other_params), '[' + device.name, '(' + device.platform.name + ')] for sizes', sizes))))
-         with open(fname, "w+") as archive:
-           return optimize.genetic(statement, ctx, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params),
-                         TYPES[operation]['parameter-names'], lambda t: TYPES[operation]['perf-index']([datatype().itemsize, sizes, t]), TYPES[operation]['perf-measure'], archive)
-       s = map_to_list((int, p['size']))
-       #Vector AXPY
-       if operation=='vector-axpy':
-         x = vcl.Vector(s[0], context=ctx, dtype=datatype)
-         y = vcl.Vector(s[0], context=ctx, dtype=datatype)
-         execute(vcl.ElementProd(vcl.exp(x + y),vcl.cos(x + y)), ())
-       #Matrix AXPY
-       if operation=='matrix-axpy':
-         A = vcl.Matrix(s, context=ctx, dtype=datatype)
-         B = vcl.Matrix(s, context=ctx, dtype=datatype)
-         execute(A+B, ())
-       #Row-wise reduction
-       if operation=='row-wise-reduction':
-         layouts = map_to_list((str,p['layout']))
-         if 'all' in layouts:
-           layouts = ['N', 'T']
-         for A_trans in layouts:
-           A = vcl.Matrix(s if A_trans=='N' else s[::-1], context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
-           x = vcl.Vector(s[1] if A_trans=='N' else s[0], context=ctx, dtype=datatype)
-           LHS = A if A_trans=='N' else A.T
-           execute(LHS*x, ())
-       #Matrix Product
-       if operation=='matrix-product':
-         layouts = map_to_list((str,p['layout']))
-         if 'all' in layouts:
-           layouts = ['NN', 'NT', 'TN', 'TT']
-         for layout in layouts:
-           def execution_handler(sizes, fname, parameters=None):
-             A_trans = layout[0]
-             B_trans = layout[1]
-             A = vcl.Matrix((sizes[0], sizes[1]) if A_trans=='N' else (sizes[1],sizes[0]), context=ctx, dtype=datatype, layout=vcl.COL_MAJOR);
-             B = vcl.Matrix((sizes[1], sizes[2]) if B_trans=='N' else (sizes[2],sizes[1]), context=ctx, dtype=datatype, layout=vcl.COL_MAJOR);
-             LHS = A if A_trans=='N' else A.T
-             RHS = B if B_trans=='N' else B.T
-             alpha = vcl.HostScalar(1.0,  context=ctx, dtype = datatype)
-             beta = vcl.HostScalar(1.0, context=ctx, dtype = datatype)
-             C = vcl.Matrix((sizes[0], sizes[2]), context=ctx, dtype = datatype, layout=vcl.COL_MAJOR)
-             statement = vcl.Statement(vcl.Assign(C,LHS*RHS*alpha + C*beta))
-             if parameters:
-               TemplateType = TYPES[operation]['template']
-               return tools.benchmark(TemplateType(TemplateType.Parameters(*parameters),A_trans,B_trans), statement, device)
-             else:
-               execute(statement,(A_trans, B_trans), sizes, fname)
-           X, Y, profiles = generate_dataset(TYPES[operation]['template'], execution_handler)
-           train_model(X, Y, profiles)
+    config = ConfigObj(config_fname, configspec=spec_fname)
+    map_to_list = lambda T: list(map(T[0], T[1] if isinstance(T[1], list) else [T[1]]))
+    for operation in ['vector-axpy', 'matrix-axpy', 'row-wise-reduction', 'matrix-product']:
+        if operation in config:
+            p = config[operation]
+            confdevices = p['devices']
+            devices = utils.DEVICES_PRESETS[confdevices] if confdevices in utils.DEVICES_PRESETS else [utils.all_devices[int(i)] for i in confdevices]
+            precisions =  map_to_list((str, p['precision']))
+            datatypes = [DATATYPES[k] for k in precisions]
+            #Iterate through the datatypes and the devices
+            for datatype, device in itertools.product(datatypes, devices):
+                ctx = cl.Context([device])
+                ctx = vcl.backend.Context(ctx)
+                device = ctx.current_device
+                #Check data-type
+                if datatype is vcl.float64 and not device.double_fp_config:
+                    sys.stderr.write('Warning : The device ' + device.name + ' does not support double precision! Skipping ...')
+                    continue
+                #Helper
+                def execute(statement, other_params, sizes, fname = os.devnull):
+                    print('-----')
+                    print(' '.join(map(str, ("Now tuning:", datatype.__name__, '-', operation, '-'.join(other_params), '[' + device.name, '(' + device.platform.name + ')] for sizes', sizes))))
+                    with open(fname, "w+") as archive:
+                        return optimize.genetic(statement, ctx, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params),
+                                      TYPES[operation]['parameter-names'], lambda t: TYPES[operation]['perf-index']([datatype().itemsize, sizes, t]), TYPES[operation]['perf-measure'], archive)
+                s = map_to_list((int, p['size']))
+                #Vector AXPY
+                if operation=='vector-axpy':
+                    x = vcl.Vector(s[0], context=ctx, dtype=datatype)
+                    y = vcl.Vector(s[0], context=ctx, dtype=datatype)
+                    execute(vcl.ElementProd(vcl.exp(x + y),vcl.cos(x + y)), ())
+                #Matrix AXPY
+                if operation=='matrix-axpy':
+                    A = vcl.Matrix(s, context=ctx, dtype=datatype)
+                    B = vcl.Matrix(s, context=ctx, dtype=datatype)
+                    execute(A+B, ())
+                #Row-wise reduction
+                if operation=='row-wise-reduction':
+                    layouts = map_to_list((str,p['layout']))
+                    if 'all' in layouts:
+                        layouts = ['N', 'T']
+                    for A_trans in layouts:
+                        A = vcl.Matrix(s if A_trans=='N' else s[::-1], context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
+                        x = vcl.Vector(s[1] if A_trans=='N' else s[0], context=ctx, dtype=datatype)
+                        LHS = A if A_trans=='N' else A.T
+                        execute(LHS*x, ())
+                #Matrix Product
+                if operation=='matrix-product':
+                    layouts = map_to_list((str,p['layout']))
+                    if 'all' in layouts:
+                        layouts = ['NN', 'NT', 'TN', 'TT']
+                    for layout in layouts:
+                        def execution_handler(sizes, fname, parameters=None):
+                            A_trans = layout[0]
+                            B_trans = layout[1]
+                            A = vcl.Matrix((sizes[0], sizes[1]) if A_trans=='N' else (sizes[1],sizes[0]), context=ctx, dtype=datatype, layout=vcl.COL_MAJOR);
+                            B = vcl.Matrix((sizes[1], sizes[2]) if B_trans=='N' else (sizes[2],sizes[1]), context=ctx, dtype=datatype, layout=vcl.COL_MAJOR);
+                            LHS = A if A_trans=='N' else A.T
+                            RHS = B if B_trans=='N' else B.T
+                            alpha = vcl.HostScalar(1.0,  context=ctx, dtype = datatype)
+                            beta = vcl.HostScalar(1.0, context=ctx, dtype = datatype)
+                            C = vcl.Matrix((sizes[0], sizes[2]), context=ctx, dtype = datatype, layout=vcl.COL_MAJOR)
+                            statement = vcl.Statement(vcl.Assign(C,LHS*RHS*alpha + C*beta))
+                            if parameters:
+                                TemplateType = TYPES[operation]['template']
+                                return tools.benchmark(TemplateType(TemplateType.Parameters(*parameters),A_trans,B_trans), statement, device)
+                            else:
+                                execute(statement,(A_trans, B_trans), sizes, fname)
+                        X, Y, profiles = generate_dataset(TYPES[operation]['template'], execution_handler)
+                        train_model(X, Y, profiles)



 if __name__ == "__main__":
-  parser = argparse.ArgumentParser();
-  subparsers = parser.add_subparsers(dest='action')
-  print_devices_parser = subparsers.add_parser('list-devices', help='list the devices available')
-  tune_parser = subparsers.add_parser('tune', help='tune using a specific configuration file')
-  tune_parser.add_argument("--config", default="config.ini", required=False, type=str)
-  tune_parser.add_argument("--viennacl-root", default='', required=False, type=str)
-  args = parser.parse_args()
+    parser = argparse.ArgumentParser();
+    subparsers = parser.add_subparsers(dest='action')
+    print_devices_parser = subparsers.add_parser('list-devices', help='list the devices available')
+    tune_parser = subparsers.add_parser('tune', help='tune using a specific configuration file')
+    tune_parser.add_argument("--config", default="config.ini", required=False, type=str)
+    tune_parser.add_argument("--viennacl-root", default='', required=False, type=str)
+    args = parser.parse_args()

-  if(args.action=='list-devices'):
-      print("----------------")
-      print("Devices available:")
-      print("----------------")
-      devices = [d for platform in cl.get_platforms() for d in platform.get_devices()]
-      for (i, d) in enumerate(devices):
-          print('Device', i, ':', utils.DEVICE_TYPE_PREFIX[d.type].upper() + ':', d.name, 'on', d.platform.name)
-      print("----------------")
-  else:
-      print("------")
-      print("Auto-tuning")
-      print("------")
-      do_tuning(args.config, 'config_spec.ini', args.viennacl_root)
+    if(args.action=='list-devices'):
+        print("----------------")
+        print("Devices available:")
+        print("----------------")
+        devices = [d for platform in cl.get_platforms() for d in platform.get_devices()]
+        for (i, d) in enumerate(devices):
+            print('Device', i, ':', utils.DEVICE_TYPE_PREFIX[d.type].upper() + ':', d.name, 'on', d.platform.name)
+        print("----------------")
+    else:
+        print("------")
+        print("Auto-tuning")
+        print("------")
+        do_tuning(args.config, 'config_spec.ini', args.viennacl_root)
--- a/autotune/python/dataset.py
+++ b/autotune/python/dataset.py
@@ -7,95 +7,95 @@ from sklearn.neighbors.kde import KernelDensity;
 from pyviennacl.atidlas import FetchingPolicy

 def decode(y):
-  fetch = [FetchingPolicy.FETCH_FROM_LOCAL, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS, FetchingPolicy.FETCH_FROM_GLOBAL_STRIDED]
-  y[7] = fetch[y[7]]
-  y[8] = fetch[y[8]]
-  return y
+    fetch = [FetchingPolicy.FETCH_FROM_LOCAL, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS, FetchingPolicy.FETCH_FROM_GLOBAL_STRIDED]
+    y[7] = fetch[y[7]]
+    y[8] = fetch[y[8]]
+    return y

 def generate_dataset(TemplateType, execution_handler):
-  I = 0
-  step = 64;
-  max_size = 4000;
+    I = 0
+    step = 64;
+    max_size = 4000;

-  #Retrieves the existing data
-  print "Retrieving data..."
-  path = "./data"
-  files = os.listdir(path)
-  X = np.empty((len(files),3))
-  t = np.empty(len(files))
-  profiles = []
-  nonemptyfiles = []
-  for i,fname in enumerate(files):
-    if os.path.getsize(os.path.join(path,fname))>0:
-      nonemptyfiles.append(fname)
-  files = nonemptyfiles
+    #Retrieves the existing data
+    print "Retrieving data..."
+    path = "./data"
+    files = os.listdir(path)
+    X = np.empty((len(files),3))
+    t = np.empty(len(files))
+    profiles = []
+    nonemptyfiles = []
+    for i,fname in enumerate(files):
+        if os.path.getsize(os.path.join(path,fname))>0:
+            nonemptyfiles.append(fname)
+    files = nonemptyfiles

-  for i,fname in enumerate(files):
-    MNK = re.search(r"([0-9]+)-([0-9]+)-([0-9]+).csv", fname)
-    fl = open(os.path.join(path,fname),"rb")
-    A = np.loadtxt(fl,delimiter=',')
-    x = np.array([MNK.group(1), MNK.group(2), MNK.group(3)]).astype(float)
-    y = tuple(A[np.argmin(A[:,0]),1:])
-    if y not in profiles:
-      profiles.append(y)
-    idx = profiles.index(y)
-    X[i,:] = x
-    t[i] = idx
+    for i,fname in enumerate(files):
+        MNK = re.search(r"([0-9]+)-([0-9]+)-([0-9]+).csv", fname)
+        fl = open(os.path.join(path,fname),"rb")
+        A = np.loadtxt(fl,delimiter=',')
+        x = np.array([MNK.group(1), MNK.group(2), MNK.group(3)]).astype(float)
+        y = tuple(A[np.argmin(A[:,0]),1:])
+        if y not in profiles:
+            profiles.append(y)
+        idx = profiles.index(y)
+        X[i,:] = x
+        t[i] = idx

-  #Generates new data
-  print "Generating new data..."
-  kdes = [KernelDensity(kernel='gaussian', bandwidth=2*step).fit(X[t==i,:]) for i in range(int(max(t))+1)] if files else [];
-  X.resize((len(files)+I, 3), refcheck=False);
-  t.resize(len(files)+I, refcheck=False);
+    #Generates new data
+    print "Generating new data..."
+    kdes = [KernelDensity(kernel='gaussian', bandwidth=2*step).fit(X[t==i,:]) for i in range(int(max(t))+1)] if files else [];
+    X.resize((len(files)+I, 3), refcheck=False);
+    t.resize(len(files)+I, refcheck=False);

-  max_square = max_size/step
-  for i in range(I):
-    n_per_label = np.bincount(t[0:i+1].astype(int));
-    Xtuples = [tuple(x) for x in X];
-    r = random.random();
-    while(True):
-      if(len(kdes)==0 or r<=1.0/len(kdes)):
-        x = np.array([step*random.randint(1,40), step*random.randint(1,40), step*random.randint(1,40)]);
-      else:
-        probs = (1.0/n_per_label)
-        distr = np.random.choice(range(n_per_label.size), p = probs/np.sum(probs))
-        x = kdes[distr].sample()[0]
-        x = np.maximum(np.ones(x.shape),(x - step/2).astype(int)/step + 1)*step
-      if tuple(x) not in Xtuples:
-        break;
-    x = x.astype(int)
-    fname = os.path.join(path, `x[0]` +"-"+ `x[1]` +"-"+ `x[2]` +".csv")
-    #Execute auto-tuning procedure
-    execution_handler(x, fname)
-    #Load csv into matrix
-    fl = open(fname,"rb");
-    A = np.loadtxt(fl,delimiter=',');
-    #Update the kernel density estimators
-    y = tuple(A[np.argmin(A[:,0]),1:]);
-    if y not in profiles:
-      profiles.append(y);
-      kdes.append(KernelDensity(kernel='gaussian', bandwidth=2*step));
-    idx = profiles.index(y);
-    #Update data
-    X[len(files)+i,:] = x;
-    t[len(files)+i] = idx;
-    #Update density estimator p(M,N,K | t=idx)
-    kdes[idx].fit(X[t[0:len(files)+i+1]==idx,:]);
+    max_square = max_size/step
+    for i in range(I):
+        n_per_label = np.bincount(t[0:i+1].astype(int));
+        Xtuples = [tuple(x) for x in X];
+        r = random.random();
+        while(True):
+            if(len(kdes)==0 or r<=1.0/len(kdes)):
+                x = np.array([step*random.randint(1,40), step*random.randint(1,40), step*random.randint(1,40)]);
+            else:
+                probs = (1.0/n_per_label)
+                distr = np.random.choice(range(n_per_label.size), p = probs/np.sum(probs))
+                x = kdes[distr].sample()[0]
+                x = np.maximum(np.ones(x.shape),(x - step/2).astype(int)/step + 1)*step
+            if tuple(x) not in Xtuples:
+                break;
+        x = x.astype(int)
+        fname = os.path.join(path, `x[0]` +"-"+ `x[1]` +"-"+ `x[2]` +".csv")
+        #Execute auto-tuning procedure
+        execution_handler(x, fname)
+        #Load csv into matrix
+        fl = open(fname,"rb");
+        A = np.loadtxt(fl,delimiter=',');
+        #Update the kernel density estimators
+        y = tuple(A[np.argmin(A[:,0]),1:]);
+        if y not in profiles:
+            profiles.append(y);
+            kdes.append(KernelDensity(kernel='gaussian', bandwidth=2*step));
+        idx = profiles.index(y);
+        #Update data
+        X[len(files)+i,:] = x;
+        t[len(files)+i] = idx;
+        #Update density estimator p(M,N,K | t=idx)
+        kdes[idx].fit(X[t[0:len(files)+i+1]==idx,:]);


-  print "Exporting data...";
-  #Shuffle the list of file
-  files = os.listdir(path)
-  X = np.empty((len(files),3))
-  Y = np.zeros((len(files), len(profiles)))
-  for i,fname in enumerate(files):
-    MNK = re.search(r"([0-9]+)-([0-9]+)-([0-9]+).csv", fname)
-    X[i,:] = map(float,[MNK.group(k) for k in range(1,4)])
-    fl = open(os.path.join(path,fname),"rb");
-    A = np.loadtxt(fl,delimiter=',')
-    for j,y in enumerate(profiles):
-      idx = np.where(np.all(A[:,1:]==y,axis=1))[0]
-      T = A[idx[0], 0] if idx.size else execution_handler(map(int,X[i,:]), '', decode(map(int, y)))
-      Y[i,j] = 2*1e-9*X[i,0]*X[i,1]*X[i,2]/T
+    print "Exporting data...";
+    #Shuffle the list of file
+    files = os.listdir(path)
+    X = np.empty((len(files),3))
+    Y = np.zeros((len(files), len(profiles)))
+    for i,fname in enumerate(files):
+        MNK = re.search(r"([0-9]+)-([0-9]+)-([0-9]+).csv", fname)
+        X[i,:] = map(float,[MNK.group(k) for k in range(1,4)])
+        fl = open(os.path.join(path,fname),"rb");
+        A = np.loadtxt(fl,delimiter=',')
+        for j,y in enumerate(profiles):
+            idx = np.where(np.all(A[:,1:]==y,axis=1))[0]
+            T = A[idx[0], 0] if idx.size else execution_handler(map(int,X[i,:]), '', decode(map(int, y)))
+            Y[i,j] = 2*1e-9*X[i,0]*X[i,1]*X[i,2]/T

-  return X, Y, profiles
+    return X, Y, profiles
--- a/autotune/python/genetic.py
+++ b/autotune/python/genetic.py
@@ -15,12 +15,12 @@ from collections import OrderedDict as odict


 def closest_divisor(N, x):
-  x_low=x_high=max(1,min(round(x),N))
-  while N % x_low > 0 and x_low>0:
-    x_low = x_low - 1
-  while N % x_high > 0 and x_high < N:
-    x_high = x_high + 1
-  return x_low if x - x_low < x_high - x else x_high
+    x_low=x_high=max(1,min(round(x),N))
+    while N % x_low > 0 and x_low>0:
+        x_low = x_low - 1
+    while N % x_high > 0 and x_high < N:
+        x_high = x_high + 1
+    return x_low if x - x_low < x_high - x else x_high

 def b_gray_to_bin(A='00000000', endian='big'):
    assert type(endian) is str
@@ -33,154 +33,152 @@ def b_gray_to_bin(A='00000000', endian='big'):

 class GeneticOperators(object):

-  def __init__(self, device, statement, parameter_names, TemplateType, build_template, out):
-      self.device = device
-      self.statement = statement
-      self.parameter_names = parameter_names
-      self.TemplateType = TemplateType
-      self.ParameterType = TemplateType.Parameters
-      self.build_template = build_template
-      self.cache = {}
-      self.indpb = 0.05
-      self.out = out
+    def __init__(self, device, statement, parameter_names, TemplateType, build_template, out):
+        self.device = device
+        self.statement = statement
+        self.parameter_names = parameter_names
+        self.TemplateType = TemplateType
+        self.ParameterType = TemplateType.Parameters
+        self.build_template = build_template
+        self.cache = {}
+        self.indpb = 0.05
+        self.out = out

-      creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
-      creator.create("Individual", list, fitness=creator.FitnessMin)
+        creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
+        creator.create("Individual", list, fitness=creator.FitnessMin)

-      self.toolbox = base.Toolbox()
-      self.toolbox.register("population", self.init)
-      self.toolbox.register("evaluate", self.evaluate)
-      self.toolbox.register("mate", deap_tools.cxTwoPoint)
-      self.toolbox.register("mutate", self.mutate)
-      self.toolbox.register("select", deap_tools.selNSGA2)
+        self.toolbox = base.Toolbox()
+        self.toolbox.register("population", self.init)
+        self.toolbox.register("evaluate", self.evaluate)
+        self.toolbox.register("mate", deap_tools.cxTwoPoint)
+        self.toolbox.register("mutate", self.mutate)
+        self.toolbox.register("select", deap_tools.selNSGA2)

-  @staticmethod
-  def decode(s):
-    FetchingPolicy = vcl.atidlas.FetchingPolicy
-    fetch = [FetchingPolicy.FETCH_FROM_LOCAL, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS, FetchingPolicy.FETCH_FROM_GLOBAL_STRIDED]
-    fetchA = fetch[s[0]]
-    fetchB = fetch[s[1]]
-    bincode = ''.join(s[2:])
-    decode_element = lambda x:2**int(b_gray_to_bin(x), 2)
-    simd = decode_element(bincode[0:3])
-    ls0 = decode_element(bincode[2:5])
-    ls1 = decode_element(bincode[5:8])
-    kL = decode_element(bincode[8:11])
-    mS = decode_element(bincode[11:14])
-    kS = decode_element(bincode[14:17])
-    nS = decode_element(bincode[17:20])
-    if fetchA==FetchingPolicy.FETCH_FROM_LOCAL or fetchB==FetchingPolicy.FETCH_FROM_LOCAL:
-      lf0 = decode_element(bincode[20:23])
-      lf1 = ls0*ls1/lf0
-    else:
-      lf0, lf1 = 0, 0
-    return [simd, ls0, kL, ls1, mS, kS, nS, fetchA, fetchB, lf0, lf1]
+    @staticmethod
+    def decode(s):
+        FetchingPolicy = vcl.atidlas.FetchingPolicy
+        fetch = [FetchingPolicy.FETCH_FROM_LOCAL, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS, FetchingPolicy.FETCH_FROM_GLOBAL_STRIDED]
+        fetchA = fetch[s[0]]
+        fetchB = fetch[s[1]]
+        bincode = ''.join(s[2:])
+        decode_element = lambda x:2**int(b_gray_to_bin(x), 2)
+        simd = decode_element(bincode[0:3])
+        ls0 = decode_element(bincode[2:5])
+        ls1 = decode_element(bincode[5:8])
+        kL = decode_element(bincode[8:11])
+        mS = decode_element(bincode[11:14])
+        kS = decode_element(bincode[14:17])
+        nS = decode_element(bincode[17:20])
+        if fetchA==FetchingPolicy.FETCH_FROM_LOCAL or fetchB==FetchingPolicy.FETCH_FROM_LOCAL:
+            lf0 = decode_element(bincode[20:23])
+            lf1 = ls0*ls1/lf0
+        else:
+            lf0, lf1 = 0, 0
+        return [simd, ls0, kL, ls1, mS, kS, nS, fetchA, fetchB, lf0, lf1]

-  def init(self, N):
-    result = []
-    fetchcount = [0, 0, 0]
-    while len(result) < N:
-      while True:
-        fetch = random.randint(0,2)
-        bincode = [fetch, fetch] + [str(random.randint(0,1)) for i in range(23)]
-        parameters = self.decode(bincode)
-        template = self.build_template(self.TemplateType.Parameters(*parameters))
-        registers_usage = template.registers_usage(vcl.atidlas.StatementsTuple(self.statement))/4
-        lmem_usage = template.lmem_usage(vcl.atidlas.StatementsTuple(self.statement))
-        local_size = template.parameters.local_size_0*template.parameters.local_size_1
-        occupancy_record = tools.OccupancyRecord(self.device, local_size, lmem_usage, registers_usage)
-        if not tools.skip(template, self.statement, self.device):
-          fetchcount[fetch] = fetchcount[fetch] + 1
-          if max(fetchcount) - min(fetchcount) <= 1:
-            result.append(creator.Individual(bincode))
-            break
-          else:
-            fetchcount[fetch] = fetchcount[fetch] - 1
-    return result
+    def init(self, N):
+        result = []
+        fetchcount = [0, 0, 0]
+        while len(result) < N:
+            while True:
+                fetch = random.randint(0,2)
+                bincode = [fetch, fetch] + [str(random.randint(0,1)) for i in range(23)]
+                parameters = self.decode(bincode)
+                template = self.build_template(self.TemplateType.Parameters(*parameters))
+                registers_usage = template.registers_usage(vcl.atidlas.StatementsTuple(self.statement))/4
+                lmem_usage = template.lmem_usage(vcl.atidlas.StatementsTuple(self.statement))
+                local_size = template.parameters.local_size_0*template.parameters.local_size_1
+                occupancy_record = tools.OccupancyRecord(self.device, local_size, lmem_usage, registers_usage)
+                if not tools.skip(template, self.statement, self.device):
+                    fetchcount[fetch] = fetchcount[fetch] + 1
+                    if max(fetchcount) - min(fetchcount) <= 1:
+                        result.append(creator.Individual(bincode))
+                        break
+                    else:
+                        fetchcount[fetch] = fetchcount[fetch] - 1
+        return result

-  def mutate(self, individual):
-    while True:
-      new_individual = copy.deepcopy(individual)
-      for i in range(len(new_individual)):
-        if i < 2 and random.random() < self.indpb:
-          while new_individual[i] == individual[i]:
-            new_individual[i] = random.randint(0, 2)
-        elif i >= 2 and random.random() < self.indpb:
-          new_individual[i] = '1' if new_individual[i]=='0' else '0'
-      parameters = self.decode(new_individual)
-      template = self.build_template(self.TemplateType.Parameters(*parameters))
-      #print tools.skip(template, self.statement, self.device), parameters
-      if not tools.skip(template, self.statement, self.device):
-        break
-    return new_individual,
+    def mutate(self, individual):
+        while True:
+            new_individual = copy.deepcopy(individual)
+            for i in range(len(new_individual)):
+                if i < 2 and random.random() < self.indpb:
+                    while new_individual[i] == individual[i]:
+                        new_individual[i] = random.randint(0, 2)
+                elif i >= 2 and random.random() < self.indpb:
+                    new_individual[i] = '1' if new_individual[i]=='0' else '0'
+            parameters = self.decode(new_individual)
+            template = self.build_template(self.TemplateType.Parameters(*parameters))
+            #print tools.skip(template, self.statement, self.device), parameters
+            if not tools.skip(template, self.statement, self.device):
+                break
+        return new_individual,

-  def evaluate(self, individual):
-    if tuple(individual) not in self.cache:
-      parameters = self.decode(individual)      
-      template = self.build_template(self.TemplateType.Parameters(*parameters))
-      try:
-        tt = tools.benchmark(template, self.statement, self.device)
-        self.out.write(','.join([str(tt)]+map(str,map(int,parameters)))+'\n')
-        self.cache[tuple(individual)] = tt
-      except:
-        self.cache[tuple(individual)] = 10
-    return self.cache[tuple(individual)],
+    def evaluate(self, individual):
+        if tuple(individual) not in self.cache:
+            parameters = self.decode(individual)
+            template = self.build_template(self.TemplateType.Parameters(*parameters))
+            try:
+                tt = tools.benchmark(template, self.statement, self.device)
+                self.out.write(','.join([str(tt)]+map(str,map(int,parameters)))+'\n')
+                self.cache[tuple(individual)] = tt
+            except:
+                self.cache[tuple(individual)] = 10
+        return self.cache[tuple(individual)],

-  def optimize(self, maxtime, maxgen, compute_perf, perf_metric):
-      hof = deap_tools.HallOfFame(1)
-      # Begin the generational process
-      gen = 0
-      maxtime = time.strptime(maxtime, '%Mm%Ss')
-      maxtime = maxtime.tm_min*60 + maxtime.tm_sec
-      start_time = time.time()
+    def optimize(self, maxtime, maxgen, compute_perf, perf_metric):
+        hof = deap_tools.HallOfFame(1)
+        # Begin the generational process
+        gen = 0
+        maxtime = time.strptime(maxtime, '%Mm%Ss')
+        maxtime = maxtime.tm_min*60 + maxtime.tm_sec
+        start_time = time.time()

-      mu = 30
-      cxpb = 0.2
-      mutpb = 0.7
+        mu = 30
+        cxpb = 0.2
+        mutpb = 0.7

-      population = self.init(mu)
-      invalid_ind = [ind for ind in population if not ind.fitness.valid]
-      fitnesses = self.toolbox.map(self.evaluate, invalid_ind)
-      for ind, fit in zip(invalid_ind, fitnesses):
-        ind.fitness.values = fit
-      hof.update(population)
-        
-      while time.time() - start_time < maxtime:
-        # Vary the population        
-        offspring = []
-        for _ in xrange(mu):
-            op_choice = random.random()
-            if op_choice < cxpb:            # Apply crossover
-                ind1, ind2 = map(self.toolbox.clone, random.sample(population, 2))
-                ind1, ind2 = self.toolbox.mate(ind1, ind2)
-                del ind1.fitness.values
-                offspring.append(ind1)
-            elif op_choice < cxpb + mutpb:  # Apply mutation
-                ind = self.toolbox.clone(random.choice(population))
-                ind, = self.toolbox.mutate(ind)
-                del ind.fitness.values
-                offspring.append(ind)
-            else:                           # Apply reproduction
-                offspring.append(random.choice(population))
-        
-        #~ for x in offspring:
-          #~ print self.decode(x)
-        # Evaluate the individuals with an invalid fitness
-        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
+        population = self.init(mu)
+        invalid_ind = [ind for ind in population if not ind.fitness.valid]
        fitnesses = self.toolbox.map(self.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
-        # Update the hall of fame with the generated individuals
-        hof.update(offspring)
-        # Select the next generation population
-        population[:] = self.toolbox.select(population + offspring, mu)
-        #Update
-        gen = gen + 1
-        best_profile = '(%s)'%','.join(map(str,GeneticOperators.decode(hof[0])));
-        best_performance = compute_perf(hof[0].fitness.values[0])
-        sys.stdout.write('Time %d | Best %d %s [ for %s ]\r'%(time.time() - start_time, best_performance, perf_metric, best_profile))
-        sys.stdout.flush()
-      sys.stdout.write('\n')
-      return population
+        hof.update(population)

+        while time.time() - start_time < maxtime:
+            # Vary the population
+            offspring = []
+            for _ in xrange(mu):
+                op_choice = random.random()
+                if op_choice < cxpb:            # Apply crossover
+                    ind1, ind2 = map(self.toolbox.clone, random.sample(population, 2))
+                    ind1, ind2 = self.toolbox.mate(ind1, ind2)
+                    del ind1.fitness.values
+                    offspring.append(ind1)
+                elif op_choice < cxpb + mutpb:  # Apply mutation
+                    ind = self.toolbox.clone(random.choice(population))
+                    ind, = self.toolbox.mutate(ind)
+                    del ind.fitness.values
+                    offspring.append(ind)
+                else:                           # Apply reproduction
+                    offspring.append(random.choice(population))

+            #~ for x in offspring:
+                    #~ print self.decode(x)
+            # Evaluate the individuals with an invalid fitness
+            invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
+            fitnesses = self.toolbox.map(self.evaluate, invalid_ind)
+            for ind, fit in zip(invalid_ind, fitnesses):
+                ind.fitness.values = fit
+            # Update the hall of fame with the generated individuals
+            hof.update(offspring)
+            # Select the next generation population
+            population[:] = self.toolbox.select(population + offspring, mu)
+            #Update
+            gen = gen + 1
+            best_profile = '(%s)'%','.join(map(str,GeneticOperators.decode(hof[0])));
+            best_performance = compute_perf(hof[0].fitness.values[0])
+            sys.stdout.write('Time %d | Best %d %s [ for %s ]\r'%(time.time() - start_time, best_performance, perf_metric, best_profile))
+            sys.stdout.flush()
+        sys.stdout.write('\n')
+        return population
--- a/autotune/python/model.py
+++ b/autotune/python/model.py
@@ -4,41 +4,41 @@ import numpy as np
 import scipy as sp

 def train_model(X, Y, profiles):
-  #Preprocessing
-  scaler = preprocessing.StandardScaler().fit(X);
-  X = scaler.transform(X);
-  ref = np.argmax(np.bincount(np.argmax(Y, axis=1))) #most common profile
+    #Preprocessing: fit a StandardScaler on X and apply it ('profiles' is not referenced in the body shown in this hunk)
+    scaler = preprocessing.StandardScaler().fit(X);
+    X = scaler.transform(X);
+    ref = np.argmax(np.bincount(np.argmax(Y, axis=1))) #reference profile: the column that is most often the row-wise maximum of Y

-  print Y
-  print np.bincount(np.argmax(Y, axis=1))
-  #Cross-validation data-sets
-  cut = int(0.5*X.shape[0]+1);
-  XTr = X[0:cut, :];
-  YTr = Y[0:cut, :];
-  XTe = X[cut:,:];
-  YTe = Y[cut:,:];
+    print Y  # debug output (Python 2 print statement)
+    print np.bincount(np.argmax(Y, axis=1))  # how often each column of Y is the row-wise maximum
+    #Hold-out split (no shuffling): the first half of the rows plus one trains the model, the remaining rows test it
+    cut = int(0.5*X.shape[0]+1);
+    XTr = X[0:cut, :];
+    YTr = Y[0:cut, :];
+    XTe = X[cut:,:];
+    YTe = Y[cut:,:];

-  #Train the model
-  print("Training the model...");
-  clf = linear_model.LinearRegression().fit(XTr,YTr);
+    #Train the model
+    print("Training the model...");
+    clf = linear_model.LinearRegression().fit(XTr,YTr);

-  #Evaluate the model
-  GFlops = np.empty(XTe.shape[0]);
-  speedups = np.empty(XTe.shape[0]);
-  optspeedups = np.empty(XTe.shape[0]);
-  for i,x in enumerate(XTe):
-    predictions = clf.predict(x);
-    label = np.argmax(predictions);
-    speedups[i] = YTe[i,label]/YTe[i,ref];
-    optspeedups[i] = np.max(YTe[i,:])/YTe[i,ref];
-    GFlops[i] = YTe[i,ref];
+    #Evaluate the model: for every test row, compare the predicted-best column with the reference column
+    GFlops = np.empty(XTe.shape[0]);
+    speedups = np.empty(XTe.shape[0]);
+    optspeedups = np.empty(XTe.shape[0]);
+    for i,x in enumerate(XTe):
+        predictions = clf.predict(x);
+        label = np.argmax(predictions);  # column with the highest predicted value
+        speedups[i] = YTe[i,label]/YTe[i,ref];  # measured value of the predicted column relative to the reference column
+        optspeedups[i] = np.max(YTe[i,:])/YTe[i,ref];  # best ratio any column could have reached for this row
+        GFlops[i] = YTe[i,ref];  # measured value of the reference column (printed as GFLOP/s below)

-  np.set_printoptions(precision=2);
-  print("-----------------");
-  print("Average testing speedup : %f (Optimal : %f)"%(sp.stats.gmean(speedups), sp.stats.gmean(optspeedups)));
-  print("Average GFLOP/s : %f (Default %f, Optimal %f)"%(np.mean(np.multiply(GFlops,speedups)), np.mean(GFlops), np.mean(np.multiply(GFlops,optspeedups))));
-  print("Minimum speedup is %f wrt %i GFlops"%(np.min(speedups), GFlops[np.argmin(speedups)]));
-  print("Maximum speedup is %f wrt %i GFlops"%(np.max(speedups), GFlops[np.argmax(speedups)]));
-  print("--------");
+    np.set_printoptions(precision=2);
+    print("-----------------");
+    print("Average testing speedup : %f (Optimal : %f)"%(sp.stats.gmean(speedups), sp.stats.gmean(optspeedups)));  # NOTE(review): needs scipy.stats to be loaded; only 'import scipy as sp' is visible here - confirm
+    print("Average GFLOP/s : %f (Default %f, Optimal %f)"%(np.mean(np.multiply(GFlops,speedups)), np.mean(GFlops), np.mean(np.multiply(GFlops,optspeedups))));
+    print("Minimum speedup is %f wrt %i GFlops"%(np.min(speedups), GFlops[np.argmin(speedups)]));
+    print("Maximum speedup is %f wrt %i GFlops"%(np.max(speedups), GFlops[np.argmax(speedups)]));
+    print("--------");

-  print clf
+    print clf  # the function returns nothing in the lines shown; the fitted model is only printed
--- a/autotune/python/optimize.py
+++ b/autotune/python/optimize.py
@@ -49,5 +49,5 @@ from genetic import GeneticOperators
    #~

 def genetic(statement, context, TemplateType, build_template, parameter_names, compute_perf, perf_metric, out):
-  GA = GeneticOperators(context.devices[0], statement, parameter_names, TemplateType, build_template, out)
-  GA.optimize(maxtime='2m30s', maxgen=1000, compute_perf=compute_perf, perf_metric=perf_metric)
+    GA = GeneticOperators(context.devices[0], statement, parameter_names, TemplateType, build_template, out)  # only the first device of the context is passed on
+    GA.optimize(maxtime='2m30s', maxgen=1000, compute_perf=compute_perf, perf_metric=perf_metric)  # result is discarded, so genetic() returns None. NOTE(review): maxtime is a string here - confirm optimize() converts it before comparing it with elapsed seconds
--- a/autotune/python/tools.py
+++ b/autotune/python/tools.py
@@ -5,134 +5,136 @@ from pyviennacl.atidlas import StatementsTuple

 class PhysicalLimits:
    def __init__(self, dev):
-      self.compute_capability = pyopencl.characterize.nv_compute_capability(dev)
-      if self.compute_capability[0]==1:
-        if self.compute_capability[1]<=1:
-          self.warps_per_mp = 24
-          self.threads_per_mp = 768
-          self.num_32b_reg_per_mp = 8192
-          self.reg_alloc_unit_size = 256
+        self.compute_capability = pyopencl.characterize.nv_compute_capability(dev)  # NOTE(review): pyopencl documents a None result for non-NVIDIA devices, which would make the indexing below fail - confirm
+        if self.compute_capability[0]==1:  # compute capability 1.x
+            if self.compute_capability[1]<=1:  # 1.0 and 1.1
+                self.warps_per_mp = 24
+                self.threads_per_mp = 768
+                self.num_32b_reg_per_mp = 8192
+                self.reg_alloc_unit_size = 256
+            else:  # 1.2 and above
+                self.warps_per_mp = 32
+                self.threads_per_mp = 1024
+                self.num_32b_reg_per_mp = 16384
+                self.reg_alloc_unit_size = 512
+            self.threads_per_warp = 32  # from here on: values shared by every 1.x device
+            self.thread_blocks_per_mp = 8
+            self.reg_alloc_granularity = 'block'
+            self.reg_per_thread = 124
+            self.shared_mem_per_mp = 16384
+            self.shared_mem_alloc_unit_size = 512
+            self.warp_alloc_granularity = 2
+            self.max_thread_block_size = 512
+
+        elif self.compute_capability[0]==2:  # compute capability 2.x
+            self.threads_per_warp = 32
+            self.warps_per_mp = 48
+            self.threads_per_mp = 1536
+            self.thread_blocks_per_mp = 8
+            self.num_32b_reg_per_mp = 32768
+            self.reg_alloc_unit_size = 64
+            self.reg_alloc_granularity = 'warp'
+            self.reg_per_thread = 63
+            self.shared_mem_per_mp = 49152
+            self.shared_mem_alloc_unit_size = 128
+            self.warp_alloc_granularity = 2
+            self.max_thread_block_size = 1024
+
+        elif self.compute_capability[0]==3:  # compute capability 3.x
+            self.threads_per_warp = 32
+            self.warps_per_mp = 64
+            self.threads_per_mp = 2048
+            self.thread_blocks_per_mp = 16
+            self.num_32b_reg_per_mp = 65536
+            self.reg_alloc_unit_size = 256
+            self.reg_alloc_granularity = 'warp'
+            if(self.compute_capability[1]==5):  # only 3.5 gets the larger per-thread register limit
+                self.reg_per_thread = 255
+            else:
+                self.reg_per_thread = 63
+            self.shared_mem_per_mp = 49152
+            self.shared_mem_alloc_unit_size = 256
+            self.warp_alloc_granularity = 4
+            self.max_thread_block_size = 1024
+
        else:
-          self.warps_per_mp = 32
-          self.threads_per_mp = 1024
-          self.num_32b_reg_per_mp = 16384
-          self.reg_alloc_unit_size = 512
-        self.threads_per_warp = 32
-        self.thread_blocks_per_mp = 8
-        self.reg_alloc_granularity = 'block'
-        self.reg_per_thread = 124
-        self.shared_mem_per_mp = 16384
-        self.shared_mem_alloc_unit_size = 512
-        self.warp_alloc_granularity = 2
-        self.max_thread_block_size = 512
-        
-      elif self.compute_capability[0]==2:
-        self.threads_per_warp = 32
-        self.warps_per_mp = 48
-        self.threads_per_mp = 1536
-        self.thread_blocks_per_mp = 8
-        self.num_32b_reg_per_mp = 32768
-        self.reg_alloc_unit_size = 64
-        self.reg_alloc_granularity = 'warp'
-        self.reg_per_thread = 63
-        self.shared_mem_per_mp = 49152
-        self.shared_mem_alloc_unit_size = 128
-        self.warp_alloc_granularity = 2
-        self.max_thread_block_size = 1024
-      
-      elif self.compute_capability[0]==3:
-        self.threads_per_warp = 32
-        self.warps_per_mp = 64
-        self.threads_per_mp = 2048
-        self.thread_blocks_per_mp = 16
-        self.num_32b_reg_per_mp = 65536
-        self.reg_alloc_unit_size = 256
-        self.reg_alloc_granularity = 'warp'
-        if(self.compute_capability[1]==5):
-          self.reg_per_thread = 255
-        else:
-          self.reg_per_thread = 63
-        self.shared_mem_per_mp = 49152
-        self.shared_mem_alloc_unit_size = 256
-        self.warp_alloc_granularity = 4
-        self.max_thread_block_size = 1024
-        
-      else:
-        raise Exception('Compute capability not supported!')
-        
-def _int_floor(value, multiple_of=1):
-  """Round C{value} down to be a C{multiple_of} something."""
-  # Mimicks the Excel "floor" function (for code stolen from occupancy calculator)
-
-  from math import floor
-  return int(floor(value/multiple_of))*multiple_of
-  
-def _int_ceiling(value, multiple_of=1):
-  """Round C{value} up to be a C{multiple_of} something."""
-  # Mimicks the Excel "floor" function (for code stolen from occupancy calculator)
-
-  from math import ceil
-  return int(ceil(value/multiple_of))*multiple_of
+            raise Exception('Compute capability not supported!')  # any major version other than 1, 2 or 3

 class OccupancyRecord:

+    @staticmethod
+    def _int_floor(value, multiple_of=1):
+        """Round C{value} down to a multiple of C{multiple_of} (mimics the Excel "floor" function of the occupancy calculator)."""
+        from math import floor
+        return int(floor(float(value)/multiple_of))*multiple_of  # float(): same result whether or not '/' truncates integers
+
+    @staticmethod
+    def _int_ceiling(value, multiple_of=1):
+        """Round C{value} up to a multiple of C{multiple_of} (mimics the Excel "ceiling" function of the occupancy calculator)."""
+        from math import ceil
+        return int(ceil(float(value)/multiple_of))*multiple_of  # float(): Python 2 integer division would round down before ceil() runs
+
+    def init_nvidia(self, dev, threads, shared_mem, registers):
+        """Set limit, limited_by, warps_per_mp and occupancy (a percentage) from the PhysicalLimits of dev; raises whatever PhysicalLimits raises."""
+        physical_limits = PhysicalLimits(dev)
+        limits = [];
+        allocated_warps =  max(1,self._int_ceiling(float(threads)/physical_limits.threads_per_warp))  # a partially filled warp still occupies a whole warp
+        limits.append((min(physical_limits.thread_blocks_per_mp, self._int_floor(physical_limits.warps_per_mp/allocated_warps)), 'warps'))
+        # Register limit: only evaluated when a register count is given (registers > 0)
+        if registers>0:
+            if registers > physical_limits.reg_per_thread:
+                limits.append((0, 'registers'))
+            else:
+                allocated_regs = {'warp': allocated_warps,
+                                  'block': self._int_ceiling(self._int_ceiling(allocated_warps, physical_limits.warp_alloc_granularity)*registers*physical_limits.threads_per_warp,allocated_warps)}[physical_limits.reg_alloc_granularity]
+                max_reg_per_mp = {'warp': self._int_floor(physical_limits.num_32b_reg_per_mp/self._int_ceiling(registers*physical_limits.threads_per_warp, physical_limits.reg_alloc_unit_size), physical_limits.warp_alloc_granularity),
+                                  'block':physical_limits.num_32b_reg_per_mp}[physical_limits.reg_alloc_granularity]
+                limits.append((self._int_floor(max_reg_per_mp/allocated_regs), 'registers'))
+        # Shared-memory limit: only evaluated when a size is given (shared_mem > 0)
+        if shared_mem>0:
+            allocated_shared_mem = self._int_ceiling(shared_mem, physical_limits.shared_mem_alloc_unit_size)
+            max_shared_mem_per_mp = physical_limits.shared_mem_per_mp
+            limits.append((self._int_floor(max_shared_mem_per_mp/allocated_shared_mem), 'shared memory'))
+        # The smallest (limit, label) tuple wins; equal limits are ordered by their label string
+        self.limit, self.limited_by = min(limits)
+        self.warps_per_mp = self.limit*allocated_warps
+        self.occupancy = 100*self.warps_per_mp/physical_limits.warps_per_mp
+
    def __init__(self, dev, threads, shared_mem=0, registers=0):
-      physical_limits = PhysicalLimits(dev)
-      limits = [];
-      allocated_warps =  max(1,_int_ceiling(threads/physical_limits.threads_per_warp))
-      max_warps_per_mp = physical_limits.warps_per_mp;
-      limits.append((min(physical_limits.thread_blocks_per_mp, _int_floor(max_warps_per_mp/allocated_warps)), 'warps'))
+        self.init_nvidia(dev, threads, shared_mem, registers)  # bound call: self is supplied implicitly and must not be passed again

-      if registers>0:
-        if registers > physical_limits.reg_per_thread:
-          limits.append((0, 'registers'))
-        else:
-          allocated_regs = {'warp': allocated_warps,
-                            'block': _int_ceiling(_int_ceiling(allocated_warps, physical_limits.warp_alloc_granularity)*registers*physical_limits.threads_per_warp,allocated_warps)}[physical_limits.reg_alloc_granularity]
-          max_reg_per_mp = {'warp': _int_floor(physical_limits.num_32b_reg_per_mp/_int_ceiling(registers*physical_limits.threads_per_warp, physical_limits.reg_alloc_unit_size), physical_limits.warp_alloc_granularity),
-                            'block':physical_limits.num_32b_reg_per_mp}[physical_limits.reg_alloc_granularity]
-          limits.append((_int_floor(max_reg_per_mp/allocated_regs), 'registers'))
-      
-      if shared_mem>0:
-        allocated_shared_mem = _int_ceiling(shared_mem, physical_limits.shared_mem_alloc_unit_size)
-        max_shared_mem_per_mp = physical_limits.shared_mem_per_mp
-        limits.append((_int_floor(max_shared_mem_per_mp/allocated_shared_mem), 'shared memory'))
-      
-      self.limit, self.limited_by = min(limits)
-      self.warps_per_mp = self.limit*allocated_warps
-      self.occupancy = 100*self.warps_per_mp/physical_limits.warps_per_mp


 def skip(template, statement, device):
-      statements = StatementsTuple(statement)
-      registers_usage = template.registers_usage(statements)/4
-      lmem_usage = template.lmem_usage(statements)
-      local_size = template.parameters.local_size_0*template.parameters.local_size_1
-      occupancy_record = OccupancyRecord(device, local_size, lmem_usage, registers_usage)
-      if template.check(statement) or occupancy_record.occupancy < 15:
+    statements = StatementsTuple(statement)
+    registers_usage = template.registers_usage(statements)/4  # presumably converts bytes to 32-bit registers - TODO confirm
+    lmem_usage = template.lmem_usage(statements)
+    local_size = template.parameters.local_size_0*template.parameters.local_size_1  # product of the two local sizes, passed on as the thread count
+    occupancy_record = OccupancyRecord(device, local_size, lmem_usage, registers_usage)
+    if template.check(statement) or occupancy_record.occupancy < 15:  # skip when check() is truthy or the estimated occupancy is below 15 (percent)
        return True
-      return False
+    return False  # template is worth benchmarking

 def benchmark(template, statement, device):
-      statements = StatementsTuple(statement)
-      registers_usage = template.registers_usage(statements)/4
-      lmem_usage = template.lmem_usage(statements)
-      local_size = template.parameters.local_size_0*template.parameters.local_size_1
-      occupancy_record = OccupancyRecord(device, local_size, lmem_usage, registers_usage)
-      if occupancy_record.occupancy < 15 :
+    statements = StatementsTuple(statement)
+    registers_usage = template.registers_usage(statements)/4  # presumably converts bytes to 32-bit registers - TODO confirm
+    lmem_usage = template.lmem_usage(statements)
+    local_size = template.parameters.local_size_0*template.parameters.local_size_1  # product of the two local sizes, passed on as the thread count
+    occupancy_record = OccupancyRecord(device, local_size, lmem_usage, registers_usage)
+    if occupancy_record.occupancy < 15 :  # same 15 (percent) threshold as skip(), but reported as an error here
        raise ValueError("Template has too low occupancy")
-      else:
+    else:  # one untimed execute(statement, True), then timed runs until 10 ms have accumulated; returns the mean time per run
        #~ try:
        template.execute(statement, True)
        statement.result.context.finish_all_queues()
        N = 0
        current_time = 0
        while current_time < 1e-2:
-          time_before = time.time()
-          template.execute(statement,False)
-          statement.result.context.finish_all_queues()
-          current_time += time.time() - time_before
-          N+=1
+            time_before = time.time()
+            template.execute(statement,False)
+            statement.result.context.finish_all_queues()  # presumably blocks until the queued work is done - confirm
+            current_time += time.time() - time_before  # wall-clock seconds spent in this run
+            N+=1  # number of timed runs so far
        return current_time/N
        #~ except:
-          #~ raise ValueError("Invalid template")
+            #~ raise ValueError("Invalid template")
--- a/autotune/python/utils.py
+++ b/autotune/python/utils.py
@@ -28,6 +28,6 @@ DEVICES_PRESETS = {'all': all_devices,


 def sanitize_string(string, keep_chars = ['_']):
-  string = string.replace(' ', '_').lower()
-  string = "".join(c for c in string if c.isalnum() or c in keep_chars).rstrip()
-  return string
+    string = string.replace(' ', '_').lower()  # spaces become underscores first, so they survive the filter below with the default keep_chars
+    string = "".join(c for c in string if c.isalnum() or c in keep_chars).rstrip()  # drop everything that is neither alphanumeric nor listed in keep_chars (which is only read, never mutated)
+    return string  # lower-cased, filtered copy of the input
--- a/autotune/python/vclio.py
+++ b/autotune/python/vclio.py
@@ -3,114 +3,114 @@ import os
 import utils

 def append_include(data, path):
-  include_name = '#include "' + path +'"\n'
-  already_included = data.find(include_name)
-  if already_included == -1:
-    insert_index = data.index('\n', data.index('#define')) + 1
-    return data[:insert_index] + '\n' + include_name + data[insert_index:]
-  return data
+    include_name = '#include "' + path +'"\n'  # the exact line that is searched for and, if missing, inserted
+    already_included = data.find(include_name)  # -1 when that exact line is absent from data
+    if already_included == -1:
+        insert_index = data.index('\n', data.index('#define')) + 1  # position just after the line holding the first '#define'; index() raises ValueError when there is none
+        return data[:insert_index] + '\n' + include_name + data[insert_index:]  # blank line, then the include, then the rest of the text
+    return data  # already present: text returned unchanged

 def generate_viennacl_headers(viennacl_root, device, datatype, operation, additional_parameters, parameters):
-  builtin_database_dir = os.path.join(viennacl_root, "device_specific", "builtin_database")
-  if not os.path.isdir(builtin_database_dir):
-    raise EnvironmentError('ViennaCL root path is incorrect. Cannot access ' + builtin_database_dir + '!\n'
-                            'Your version of ViennaCL may be too old and/or corrupted.')
+    builtin_database_dir = os.path.join(viennacl_root, "device_specific", "builtin_database")  # directory that receives the generated headers
+    if not os.path.isdir(builtin_database_dir):
+        raise EnvironmentError('ViennaCL root path is incorrect. Cannot access ' + builtin_database_dir + '!\n'
+                                'Your version of ViennaCL may be too old and/or corrupted.')

-  function_name_dict = { vcl.float32: 'add_4B',
-                         vcl.float64: 'add_8B' }
+    function_name_dict = { vcl.float32: 'add_4B',
+                           vcl.float64: 'add_8B' }

-  additional_parameters_dict = {'N':  "char_to_type<'N'>",
-                                'T':  "char_to_type<'T'>"}
+    additional_parameters_dict = {'N':  "char_to_type<'N'>",
+                                  'T':  "char_to_type<'T'>"}

-  #Create the device-specific headers
-  cpp_device_name = utils.sanitize_string(device.name)
-  function_name = function_name_dict[datatype]
-  operation = operation.replace('-','_')
+    #Create the device-specific headers
+    cpp_device_name = utils.sanitize_string(device.name)  # used both as the header file name and as the innermost namespace
+    function_name = function_name_dict[datatype]  # KeyError for any datatype other than vcl.float32 / vcl.float64
+    operation = operation.replace('-','_')  # dashes are replaced so the name can be used in C++ identifiers and file names

-  cpp_class_name = operation + '_template'
-  header_name = cpp_device_name + ".hpp"
-  function_declaration = 'inline void ' + function_name + '(' + ', '.join(['database_type<' + cpp_class_name + '::parameters_type> & db'] + \
-                                                                        [additional_parameters_dict[x] for x in additional_parameters]) + ')'
+    cpp_class_name = operation + '_template'
+    header_name = cpp_device_name + ".hpp"
+    function_declaration = 'inline void ' + function_name + '(' + ', '.join(['database_type<' + cpp_class_name + '::parameters_type> & db'] + \
+                                                                          [additional_parameters_dict[x] for x in additional_parameters]) + ')'

-  device_type_prefix = utils.DEVICE_TYPE_PREFIX[device.type]
-  vendor_prefix = utils.VENDOR_PREFIX[device.vendor_id]
-  architecture_family = vcl.opencl.architecture_family(device.vendor_id, device.name)
+    device_type_prefix = utils.DEVICE_TYPE_PREFIX[device.type]
+    vendor_prefix = utils.VENDOR_PREFIX[device.vendor_id]
+    architecture_family = vcl.opencl.architecture_family(device.vendor_id, device.name)

-  header_hierarchy = ["devices", device_type_prefix, vendor_prefix, architecture_family]
-  header_directory = os.path.join(builtin_database_dir, *header_hierarchy)
-  header_path = os.path.join(header_directory, header_name)
+    header_hierarchy = ["devices", device_type_prefix, vendor_prefix, architecture_family]  # reused for the directory layout, the include guard and the namespace path
+    header_directory = os.path.join(builtin_database_dir, *header_hierarchy)
+    header_path = os.path.join(header_directory, header_name)

-  if not os.path.exists(header_directory):
-    os.makedirs(header_directory)
+    if not os.path.exists(header_directory):
+        os.makedirs(header_directory)

-  if os.path.exists(header_path):
-    with open (header_path, "r") as myfile:
-      data=myfile.read()
-  else:
-    data = ''
+    if os.path.exists(header_path):
+        with open (header_path, "r") as myfile:
+            data=myfile.read()
+    else:
+        data = ''

-  if not data:
-    ifndef_suffix = ('_'.join(header_hierarchy) + '_hpp_').upper()
-    data =  ('#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_' + ifndef_suffix + '\n'
-        '#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_' + ifndef_suffix + '\n'
-        '\n'
-        '#include "viennacl/device_specific/forwards.h"\n'
-        '#include "viennacl/device_specific/builtin_database/common.hpp"\n'
-        '\n' 
-        'namespace viennacl{\n'
-        'namespace device_specific{\n'
-        'namespace builtin_database{\n'
-        'namespace devices{\n'
-        'namespace '  + device_type_prefix + '{\n'
-        'namespace '  + vendor_prefix + '{\n'
-        'namespace '  + architecture_family + '{\n'
-        'namespace '  + cpp_device_name + '{\n'
-        '\n'
-        '}\n'
-        '}\n'
-        '}\n'
-        '}\n'
-        '}\n'
-        '}\n'
-        '}\n'
-        '}\n'
-        '#endif\n'
-        '')
+    if not data:  # missing or empty header: start from the skeleton below (include guard, two includes, eight nested namespaces)
+        ifndef_suffix = ('_'.join(header_hierarchy) + '_hpp_').upper()
+        data =  ('#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_' + ifndef_suffix + '\n'
+            '#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_' + ifndef_suffix + '\n'
+            '\n'
+            '#include "viennacl/device_specific/forwards.h"\n'
+            '#include "viennacl/device_specific/builtin_database/common.hpp"\n'
+            '\n'
+            'namespace viennacl{\n'
+            'namespace device_specific{\n'
+            'namespace builtin_database{\n'
+            'namespace devices{\n'
+            'namespace '  + device_type_prefix + '{\n'
+            'namespace '  + vendor_prefix + '{\n'
+            'namespace '  + architecture_family + '{\n'
+            'namespace '  + cpp_device_name + '{\n'
+            '\n'
+            '}\n'
+            '}\n'
+            '}\n'
+            '}\n'
+            '}\n'
+            '}\n'
+            '}\n'
+            '}\n'
+            '#endif\n'
+            '')

-  data = append_include(data, 'viennacl/device_specific/templates/' + cpp_class_name + '.hpp')
+    data = append_include(data, 'viennacl/device_specific/templates/' + cpp_class_name + '.hpp')

-  add_to_database_arguments = [vendor_prefix + '_id', utils.DEVICE_TYPE_CL_NAME[device.type], 'ocl::'+architecture_family,
-                '"' + device.name + '"',  cpp_class_name + '::parameters' + str(parameters)]    
-  core = '  db.' + function_name + '(' + ', '.join(add_to_database_arguments) + ');'
+    add_to_database_arguments = [vendor_prefix + '_id', utils.DEVICE_TYPE_CL_NAME[device.type], 'ocl::'+architecture_family,
+                  '"' + device.name + '"',  cpp_class_name + '::parameters' + str(parameters)]
+    core = '  db.' + function_name + '(' + ', '.join(add_to_database_arguments) + ');'  # the single statement that forms the body of the generated function

-  already_declared = data.find(function_declaration)
-  if already_declared==-1:
-    substr = 'namespace '  + cpp_device_name + '{\n'
-    insert_index = data.index(substr) + len(substr)
-    data = data[:insert_index] + '\n' + function_declaration + '\n{\n' + core + '\n}\n' + data[insert_index:]
-  else:
-    i1 = data.find('{', already_declared)
-    if data[i1-1]=='\n':
-      i1 = i1 - 1
-    i2 = data.find('}', already_declared) + 1 
-    data = data[:i1]  + '\n{\n' + core + '\n}' + data[i2:]
+    already_declared = data.find(function_declaration)
+    if already_declared==-1:  # declaration not in the header yet: add the function right after the device namespace opens
+        substr = 'namespace '  + cpp_device_name + '{\n'
+        insert_index = data.index(substr) + len(substr)
+        data = data[:insert_index] + '\n' + function_declaration + '\n{\n' + core + '\n}\n' + data[insert_index:]
+    else:  # declaration present: replace the text from the first '{' to the first '}' that follow it
+        i1 = data.find('{', already_declared)
+        if data[i1-1]=='\n':
+            i1 = i1 - 1  # also swallow the newline in front of '{'; it is written back below
+        i2 = data.find('}', already_declared) + 1
+        data = data[:i1]  + '\n{\n' + core + '\n}' + data[i2:]

-  #Write the header file
-  with open(header_path, "w+") as myfile:
-    myfile.write(data)
+    #Write the header file
+    with open(header_path, "w+") as myfile:
+        myfile.write(data)

-  #Updates the global ViennaCL headers
-  with open(os.path.join(builtin_database_dir, operation + '.hpp'), 'r+') as operation_header:      
-    data = operation_header.read()
-    data = append_include(data, os.path.relpath(header_path, os.path.join(viennacl_root, os.pardir)))
+    #Updates the global ViennaCL headers
+    with open(os.path.join(builtin_database_dir, operation + '.hpp'), 'r+') as operation_header:  # 'r+' requires the per-operation header to exist already
+        data = operation_header.read()
+        data = append_include(data, os.path.relpath(header_path, os.path.join(viennacl_root, os.pardir)))  # include path is made relative to the parent directory of viennacl_root

-    scope_name = '_'.join(('init', operation) + additional_parameters)
-    scope = data.index(scope_name)
-    function_call = '  ' + '::'.join(header_hierarchy + [cpp_device_name, function_name]) + '(' + ', '.join(['result'] + [additional_parameters_dict[k] + '()' for k in additional_parameters]) + ')'
-    if function_call not in data:
-      insert_index = data.rindex('\n', 0, data.index('return result', scope))
-      data = data[:insert_index] + function_call + ';\n' + data[insert_index:]
+        scope_name = '_'.join(('init', operation) + additional_parameters)  # tuple concatenation: additional_parameters has to be a tuple here
+        scope = data.index(scope_name)  # ValueError when the header has no such init function
+        function_call = '  ' + '::'.join(header_hierarchy + [cpp_device_name, function_name]) + '(' + ', '.join(['result'] + [additional_parameters_dict[k] + '()' for k in additional_parameters]) + ')'
+        if function_call not in data:
+            insert_index = data.rindex('\n', 0, data.index('return result', scope))  # last newline before the first 'return result' that follows scope_name
+            data = data[:insert_index] + function_call + ';\n' + data[insert_index:]

-    operation_header.seek(0)
-    operation_header.truncate()
-    operation_header.write(data)
+        operation_header.seek(0)  # rewind and truncate so the updated text replaces the old file content
+        operation_header.truncate()
+        operation_header.write(data)