Some improvements

Philippe Tillet
2014-09-06 00:39:38 -04:00
parent 544583e6ca
commit 2055a8cc59
12 changed files with 215 additions and 84 deletions

167
autotune/python/autotune.py Normal file

@@ -0,0 +1,167 @@
from __future__ import division
import argparse
import itertools
import os
from external.configobj import ConfigObj
import pyopencl as cl
import pyviennacl as vcl
from pyviennacl import backend
from pyviennacl import opencl
from pyviennacl import atidlas
import utils
import vclio
import optimize
import sys
DATATYPES = { 'single' : vcl.float32,
              'double' : vcl.float64 }

TYPES = { 'vector-axpy': {'template':vcl.atidlas.VectorAxpyTemplate,
                          'parameter-names':['simd-width', 'local-size-0', 'num-groups-0', 'fetch'],
                          'perf-index':lambda x: 3*x[0]*x[1][0]/x[2]*1e-9,
                          'perf-measure':'GB/s'},

          'matrix-axpy': {'template':vcl.atidlas.MatrixAxpyTemplate,
                          'parameter-names':['simd-width', 'local-size-0', 'local-size-1', 'num-groups-0', 'num-groups-1', 'fetch'],
                          'perf-index':lambda x: 3*x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
                          'perf-measure':'GB/s'},

          'reduction': {'template':vcl.atidlas.ReductionTemplate,
                        'parameter-names':['simd-width', 'local-size-0', 'num-groups-0', 'fetch'],
                        'perf-index':lambda x: 2*x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
                        'perf-measure':'GB/s'},

          'row-wise-reduction': {'template':vcl.atidlas.RowWiseReductionTemplate,
                                 'parameter-names':['simd-width', 'local-size-0', 'local-size-1', 'num-groups-0', 'fetch'],
                                 'perf-index':lambda x: x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
                                 'perf-measure':'GB/s'},

          'matrix-product': {'template':vcl.atidlas.MatrixProductTemplate,
                             'parameter-names':['simd-width', 'local-size-0', 'kL', 'local-size-1', 'mS', 'kS', 'nS', 'A-fetch-policy', 'B-fetch-policy', 'local-fetch-size-0', 'local-fetch-size-1'],
                             'perf-index': lambda x: 2*x[1][0]*x[1][1]*x[1][2]/x[2]*1e-9,
                             'perf-measure': 'GFLOP/s'} }
def parameter_space(operation):
    simd = [1, 2, 4, 8]
    pow2_1D = [2**k for k in range(12)]
    pow2_2D = [2**k for k in range(10)]
    pow2_2D_unrolled = [2**k for k in range(6)]
    FetchingPolicy = vcl.device_specific.FetchingPolicy
    fetch = [FetchingPolicy.FETCH_FROM_LOCAL, FetchingPolicy.FETCH_FROM_GLOBAL_STRIDED, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS]
    if operation == 'vector-axpy': return [simd, pow2_1D, pow2_1D, fetch]
    if operation == 'reduction': return [simd, pow2_1D, pow2_1D, fetch]
    if operation == 'matrix-axpy': return [simd, pow2_2D, pow2_2D, pow2_2D, pow2_2D, fetch]
    if operation == 'row-wise-reduction': return [simd, pow2_2D, pow2_2D, pow2_1D, fetch]
    if operation == 'matrix-product': return [simd, pow2_2D, pow2_2D, pow2_2D, pow2_2D_unrolled, pow2_2D_unrolled, pow2_2D_unrolled, fetch, fetch, [0] + pow2_2D, [0] + pow2_2D]
def do_tuning(config_fname, spec_fname, viennacl_root):
    config = ConfigObj(config_fname, configspec=spec_fname)
    map_to_list = lambda T: list(map(T[0], T[1] if isinstance(T[1], list) else [T[1]]))
    for operation in ['vector-axpy', 'matrix-axpy', 'row-wise-reduction', 'matrix-product']:
        tmp_folder = config['tmp-folder'] if 'tmp-folder' in config else ""
        if operation in config:
            p = config[operation]
            confdevices = p['devices']
            devices = utils.DEVICES_PRESETS[confdevices] if confdevices in utils.DEVICES_PRESETS else [utils.all_devices[int(i)] for i in confdevices]
            precisions = ['single', 'double'] if 'all' in p['precision'] else p['precision']
            datatypes = [DATATYPES[k] for k in precisions]
            s = map_to_list((int, p['size']))
            for datatype, device in itertools.product(datatypes, devices):
                ctx = cl.Context([device])
                ctx = vcl.backend.Context(ctx)
                device = ctx.current_device
                if datatype is vcl.float64 and not device.double_fp_config:
                    sys.stderr.write('Warning : The device ' + device.name + ' does not support double precision! Skipping ...')
                    continue
                pairs = []

                def execute(node, other_params):
                    print('-----')
                    print(' '.join(map(str, ("Now tuning:", datatype.__name__, '-', operation, '-'.join(other_params), '[' + device.name, '(' + device.platform.name + ')]'))))
                    tmp_file = os.path.join(tmp_folder, utils.sanitize_string(device.name) + "-" + datatype.__name__ + "-" + operation + '-'.join(other_params) + ".dat")
                    if tmp_folder:
                        print('Saving history to ' + tmp_file)
                        fname = tmp_file
                    else:
                        fname = os.devnull
                    with open(fname, "w+") as archive:
                        with vcl.Statement(node) as statement:
                            result = optimize.genetic(statement, ctx, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params),
                                                      TYPES[operation]['parameter-names'], parameter_space(operation), lambda t: TYPES[operation]['perf-index']([datatype().itemsize, s, t]), TYPES[operation]['perf-measure'], archive)
                    if result and viennacl_root:
                        vclio.generate_viennacl_headers(viennacl_root, device, datatype, operation, other_params, result[1])

                if operation=='vector-axpy':
                    x = vcl.Vector(s[0], context=ctx, dtype=datatype)
                    y = vcl.Vector(s[0], context=ctx, dtype=datatype)
                    execute(vcl.ElementProd(vcl.exp(x + y),vcl.cos(x + y)), ())
                if operation=='matrix-axpy':
                    A = vcl.Matrix(s, context=ctx, dtype=datatype)
                    B = vcl.Matrix(s, context=ctx, dtype=datatype)
                    execute(A+B, ())
                if operation=='row-wise-reduction':
                    layouts = map_to_list((str,p['layout']))
                    if 'all' in layouts:
                        layouts = ['N', 'T']
                    for A_trans in layouts:
                        A = vcl.Matrix(s if A_trans=='N' else s[::-1], context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
                        x = vcl.Vector(s[1] if A_trans=='N' else s[0], context=ctx, dtype=datatype)
                        LHS = A if A_trans=='N' else A.T
                        execute(LHS*x, ())
                if operation=='matrix-product':
                    layouts = map_to_list((str,p['layout']))
                    if 'all' in layouts:
                        layouts = ['NN', 'NT', 'TN', 'TT']
                    for layout in layouts:
                        A_trans = layout[0]
                        B_trans = layout[1]
                        A = vcl.Matrix((s[0], s[1]) if A_trans=='N' else (s[1],s[0]), context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
                        B = vcl.Matrix((s[1], s[2]) if B_trans=='N' else (s[2],s[1]), context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
                        LHS = A if A_trans=='N' else A.T
                        RHS = B if B_trans=='N' else B.T
                        alpha = vcl.HostScalar(1.0, context=ctx, dtype = datatype)
                        beta = vcl.HostScalar(1.0, context=ctx, dtype = datatype)
                        C = vcl.Matrix((s[0], s[2]), context=ctx, dtype = datatype, layout=vcl.COL_MAJOR)
                        execute(vcl.Assign(C,LHS*RHS*alpha + C*beta),(A_trans, B_trans))
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest='action')
    print_devices_parser = subparsers.add_parser('list-devices', help='list the devices available')
    tune_parser = subparsers.add_parser('tune', help='tune using a specific configuration file')
    tune_parser.add_argument("--config", default="config.ini", required=False, type=str)
    tune_parser.add_argument("--viennacl-root", default='', required=False, type=str)
    args = parser.parse_args()
    if(args.action=='list-devices'):
        print("----------------")
        print("Devices available:")
        print("----------------")
        devices = [d for platform in cl.get_platforms() for d in platform.get_devices()]
        for (i, d) in enumerate(devices):
            print('Device', i, ':', utils.DEVICE_TYPE_PREFIX[d.type].upper() + ':', d.name, 'on', d.platform.name)
        print("----------------")
    else:
        print("------")
        print("Auto-tuning")
        print("------")
        do_tuning(args.config, 'config_spec.ini', args.viennacl_root)
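For context, autotune.py is driven by an INI-style file parsed with ConfigObj (validated against config_spec.ini, which is not part of this hunk) plus the small CLI above. A minimal sketch of setting up a matrix-product tuning run, with illustrative paths and sizes, could look like this:

# Hypothetical config generator: the key names mirror the ones read in do_tuning(),
# and the section name must be one of the operations listed there. Run from
# autotune/python/ so that the bundled external.configobj package is importable.
from external.configobj import ConfigObj

config = ConfigObj('config.ini')
config['tmp-folder'] = '/tmp/atidlas-tuning'          # optional: where per-device history .dat files go
config['matrix-product'] = {'devices': 'gpus',        # a preset from utils.DEVICES_PRESETS, or device indices
                            'precision': ['single'],  # 'single', 'double', or 'all'
                            'size': [2560, 2560, 2560],  # M, K, N used to build A, B and C
                            'layout': 'all'}          # 'NN', 'NT', 'TN', 'TT' or 'all'
config.write()

# Then, from the shell:
#   python autotune.py list-devices
#   python autotune.py tune --config config.ini --viennacl-root /path/to/viennacl/viennacl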

140
autotune/python/genetic_operators.py Normal file

@@ -0,0 +1,140 @@
import random
import time
import tools
import pyviennacl as vcl
from collections import OrderedDict as odict
def closest_divisor(N, x):
    x_low=x_high=max(1,min(round(x),N))
    while N % x_low > 0 and x_low>0:
        x_low = x_low - 1
    while N % x_high > 0 and x_high < N:
        x_high = x_high + 1
    return x_low if x - x_low < x_high - x else x_high
class GeneticOperators(object):

    def __init__(self, device, statement, parameters, parameter_names, TemplateType, build_template):
        self.device = device
        self.statement = statement
        self.parameters = parameters
        self.parameter_names = parameter_names
        self.TemplateType = TemplateType
        self.ParameterType = TemplateType.Parameters
        self.build_template = build_template
        self.cache = {}

    def init(self):
        while True:
            result = [random.choice(L) for L in self.parameters]
            template = self.build_template(self.TemplateType.Parameters(*result))
            registers_usage = template.registers_usage(vcl.atidlas.StatementsTuple(self.statement))/4
            lmem_usage = template.lmem_usage(vcl.atidlas.StatementsTuple(self.statement))
            local_size = template.parameters.local_size_0*template.parameters.local_size_1
            occupancy_record = tools.OccupancyRecord(self.device, local_size, lmem_usage, registers_usage)
            if template.check(self.statement) and occupancy_record.occupancy >= 10:
                return result
    @staticmethod
    def min_to_hyperbol(a, tup):
        x = 1
        for i in range(100):
            dx = 2*(-a**2/x**3 + a*tup[1]/x**2 - tup[0] + x)
            ddx = 6*a**2/x**4 - 4*a*tup[1]/x**3 + 2
            if abs(dx) < 1e-7 or abs(ddx) < 1e-7:
                break
            x-=dx/ddx
            if x<1 or x>a:
                x = max(1, min(x, a))
                break
        new_x = int(closest_divisor(a, x))
        new_y = int(a / new_x)
        return (new_x, new_y)
    def repair(self,func):
        def repair_impl(child):
            D = odict(zip(self.parameter_names, child))
            dummy_template = self.build_template(self.ParameterType(*D.values()))
            FetchingPolicy = vcl.atidlas.FetchingPolicy
            if 'local-size-1' not in D:
                D['local-size-0'] = min(D['local-size-0'], self.device.max_work_group_size)
            elif D['local-size-0']*D['local-size-1'] > self.device.max_work_group_size:
                res = GeneticOperators.min_to_hyperbol(self.device.max_work_group_size, (D['local-size-0'], D['local-size-1']))
                D['local-size-0'] = res[0]
                D['local-size-1'] = res[1]
            if self.ParameterType is vcl.atidlas.MatrixProductTemplate.Parameters:
                if dummy_template.A_trans != 'N' and dummy_template.B_trans != 'T':
                    D['simd-width'] = 1
                D['mS'] = max(D['mS'], D['simd-width'])
                D['mS'] = D['mS'] - D['mS']%D['simd-width']
                D['nS'] = max(D['nS'], D['simd-width'])
                D['nS'] = D['nS'] - D['nS']%D['simd-width']
                if D['A-fetch-policy']!=FetchingPolicy.FETCH_FROM_LOCAL and D['B-fetch-policy']!=FetchingPolicy.FETCH_FROM_LOCAL:
                    D['local-fetch-size-0']=D['local-fetch-size-1']=0
                else:
                    res = GeneticOperators.min_to_hyperbol(D['local-size-0']*D['local-size-1'], (D['local-fetch-size-0'], D['local-fetch-size-1']))
                    D['local-fetch-size-0'] = res[0]
                    D['local-fetch-size-1'] = res[1]
                if D['A-fetch-policy']==FetchingPolicy.FETCH_FROM_LOCAL and dummy_template.A_trans=='N' and D['kL'] % D['local-fetch-size-1'] > 0:
                    D['kL'] = max(1,round(D['kL']/D['local-fetch-size-1']))*D['local-fetch-size-1']
                if D['B-fetch-policy']==FetchingPolicy.FETCH_FROM_LOCAL and dummy_template.B_trans=='T' and D['kL'] % D['local-fetch-size-1'] > 0:
                    D['kL'] = max(1,round(D['kL']/D['local-fetch-size-1']))*D['local-fetch-size-1']
                D['kS'] = min(D['kL'], D['kS'])
            return D.values()

        def wrapper(*args, **kargs):
            offspring = func(*args, **kargs)
            for child in offspring:
                new_child = repair_impl(child)
                for i in range(len(child)):
                    if child[i] != new_child[i]:
                        child[i] = new_child[i]
            return offspring
        return wrapper
    def mutate(self, individual, indpb):
        for i in range(len(individual)):
            if random.random() < indpb:
                j = self.parameters[i].index(individual[i])
                j = max(0,min(random.randint(j-1, j+1),len(self.parameters[i])-1))
                individual[i] = self.parameters[i][j]
        return individual,

    def evaluate(self, individual):
        tupindividual = tuple(individual)
        if tupindividual not in self.cache:
            template = self.build_template(self.TemplateType.Parameters(*individual))
            registers_usage = template.registers_usage(vcl.atidlas.StatementsTuple(self.statement))/4
            lmem_usage = template.lmem_usage(vcl.atidlas.StatementsTuple(self.statement))
            local_size = template.parameters.local_size_0*template.parameters.local_size_1
            occupancy_record = tools.OccupancyRecord(self.device, local_size, lmem_usage, registers_usage)
            if occupancy_record.occupancy < 10:
                self.cache[tupindividual] = 10
            else:
                try:
                    template.execute(self.statement, True)
                    self.statement.result.context.finish_all_queues()
                    N = 0
                    current_time = 0
                    while current_time < 1e-2:
                        time_before = time.time()
                        template.execute(self.statement,False)
                        self.statement.result.context.finish_all_queues()
                        current_time += time.time() - time_before
                        N+=1
                    self.cache[tupindividual] = current_time/N
                except:
                    self.cache[tupindividual] = 10
        return self.cache[tupindividual],
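A note on the repair step above: when a candidate's local-size-0 * local-size-1 (or its local-fetch pair) exceeds its budget, min_to_hyperbol projects the pair onto the hyperbola x*y = budget with Newton's method and then snaps x to a divisor of the budget so the product comes out exact. A brute-force reference, shown only for intuition and not part of the commit, behaves like this on small inputs:

# Illustrative stand-in for GeneticOperators.min_to_hyperbol: enumerate every
# exact factorization x*y == a and keep the one closest to the requested pair.
def min_to_hyperbol_ref(a, tup):
    factorizations = [(x, a // x) for x in range(1, a + 1) if a % x == 0]
    return min(factorizations, key=lambda p: (p[0] - tup[0])**2 + (p[1] - tup[1])**2)

print(min_to_hyperbol_ref(1024, (48, 48)))   # (32, 32): a 48x48 request shrunk to fit 1024 work-items
print(min_to_hyperbol_ref(256, (64, 16)))    # (64, 4): keeps the dominant dimension, shrinks the other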

84
autotune/python/optimize.py Normal file

@@ -0,0 +1,84 @@
import array
import numpy as np
import random
import time
import sys
from deap import algorithms
from deap import base
from deap import creator
from deap import tools
from genetic_operators import GeneticOperators
def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, maxtime, maxgen, halloffame, compute_perf, perf_metric):
    # Evaluate the individuals with an invalid fitness
    invalid_ind = [ind for ind in population if not ind.fitness.valid]
    fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
    for ind, fit in zip(invalid_ind, fitnesses):
        ind.fitness.values = fit
    if halloffame is not None:
        halloffame.update(population)
    # Begin the generational process
    gen = 0
    maxtime = time.strptime(maxtime, '%Mm%Ss')
    maxtime = maxtime.tm_min*60 + maxtime.tm_sec
    start_time = time.time()
    while time.time() - start_time < maxtime and gen < maxgen:
        # Vary the population
        offspring = algorithms.varOr(population, toolbox, lambda_, cxpb, mutpb)
        # Evaluate the individuals with an invalid fitness
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
        # Update the hall of fame with the generated individuals
        if halloffame is not None:
            halloffame.update(offspring)
        # Select the next generation population
        population[:] = toolbox.select(population + offspring, mu)
        # Update the statistics with the new population
        gen = gen + 1
        best_profile = '(%s)'%','.join(map(str,halloffame[0]))
        best_performance = compute_perf(halloffame[0].fitness.values[0])
        sys.stdout.write('Generation %d | Time %d | Best %d %s [ for %s ]\n'%(gen, time.time() - start_time, best_performance, perf_metric, best_profile))
    sys.stdout.write('\n')
    return population
def genetic(statement, context, TemplateType, build_template, parameter_names, all_parameters, compute_perf, perf_metric, out):
    gen = GeneticOperators(context.devices[0], statement, all_parameters, parameter_names, TemplateType, build_template)

    creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
    creator.create("Individual", list, fitness=creator.FitnessMin)

    toolbox = base.Toolbox()
    toolbox.register("individual", tools.initIterate, creator.Individual, gen.init)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.decorate("population", gen.repair)
    toolbox.register("evaluate", gen.evaluate)
    toolbox.register("mate", tools.cxUniform, indpb=0.1)
    toolbox.decorate("mate", gen.repair)
    toolbox.register("mutate", gen.mutate, indpb=0.1)
    toolbox.decorate("mutate", gen.repair)
    toolbox.register("select", tools.selNSGA2)

    pop = toolbox.population(n=70)
    hof = tools.HallOfFame(1)

    best_performer = lambda x: max([compute_perf(hof[0].fitness.values[0]) for t in x])
    best_profile = lambda x: '(%s)'%','.join(map(str,hof[0]))

    cxpb = 0.5
    mutpb = 0.2

    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("max (" + perf_metric + ")", lambda x: max([compute_perf(hof[0].fitness.values[0]) for t in x]))
    stats.register("profile ", lambda x: '(%s)'%','.join(map(str,hof[0])))

    pop = eaMuPlusLambda(pop, toolbox, 70, 100, cxpb=0.1, mutpb=0.1, maxtime='5m0s', maxgen=1000, halloffame=hof, compute_perf=compute_perf, perf_metric=perf_metric)
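The fitness minimized by the DEAP loop above is the averaged kernel time returned by GeneticOperators.evaluate; compute_perf is one of the perf-index lambdas declared in autotune.py and turns that time into the GB/s or GFLOP/s figure printed each generation. A worked example with the vector-axpy index (the numbers are illustrative only):

# The vector-axpy perf index from autotune.py is 3*itemsize*N/t*1e-9, i.e. three
# N-element streams of `itemsize` bytes moved in t seconds, reported in GB/s.
itemsize = 4      # bytes per element for float32
N = 10**7         # vector length, the 'size' entry of the config
t = 1e-3          # averaged kernel time in seconds (the DEAP fitness value)

print(3 * itemsize * N / t * 1e-9)   # 120.0 (GB/s)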

104
autotune/python/tools.py Normal file

@@ -0,0 +1,104 @@
from __future__ import division
import pyopencl
import pyopencl.characterize  # nv_compute_capability lives in this submodule; assumed not pulled in by the bare import
class PhysicalLimits:
    def __init__(self, dev):
        self.compute_capability = pyopencl.characterize.nv_compute_capability(dev)
        if self.compute_capability[0]==1:
            if self.compute_capability[1]<=1:
                self.warps_per_mp = 24
                self.threads_per_mp = 768
                self.num_32b_reg_per_mp = 8192
                self.reg_alloc_unit_size = 256
            else:
                self.warps_per_mp = 32
                self.threads_per_mp = 1024
                self.num_32b_reg_per_mp = 16384
                self.reg_alloc_unit_size = 512
            self.threads_per_warp = 32
            self.thread_blocks_per_mp = 8
            self.reg_alloc_granularity = 'block'
            self.reg_per_thread = 124
            self.shared_mem_per_mp = 16384
            self.shared_mem_alloc_unit_size = 512
            self.warp_alloc_granularity = 2
            self.max_thread_block_size = 512
        elif self.compute_capability[0]==2:
            self.threads_per_warp = 32
            self.warps_per_mp = 48
            self.threads_per_mp = 1536
            self.thread_blocks_per_mp = 8
            self.num_32b_reg_per_mp = 32768
            self.reg_alloc_unit_size = 64
            self.reg_alloc_granularity = 'warp'
            self.reg_per_thread = 63
            self.shared_mem_per_mp = 49152
            self.shared_mem_alloc_unit_size = 128
            self.warp_alloc_granularity = 2
            self.max_thread_block_size = 1024
        elif self.compute_capability[0]==3:
            self.threads_per_warp = 32
            self.warps_per_mp = 64
            self.threads_per_mp = 2048
            self.thread_blocks_per_mp = 16
            self.num_32b_reg_per_mp = 65536
            self.reg_alloc_unit_size = 256
            self.reg_alloc_granularity = 'warp'
            if(self.compute_capability[1]==5):
                self.reg_per_thread = 255
            else:
                self.reg_per_thread = 63
            self.shared_mem_per_mp = 49152
            self.shared_mem_alloc_unit_size = 256
            self.warp_alloc_granularity = 4
            self.max_thread_block_size = 1024
        else:
            raise Exception('Compute capability not supported!')
def _int_floor(value, multiple_of=1):
    """Round C{value} down to be a C{multiple_of} something."""
    # Mimics the Excel "floor" function (for code stolen from occupancy calculator)
    from math import floor
    return int(floor(value/multiple_of))*multiple_of

def _int_ceiling(value, multiple_of=1):
    """Round C{value} up to be a C{multiple_of} something."""
    # Mimics the Excel "ceiling" function (for code stolen from occupancy calculator)
    from math import ceil
    return int(ceil(value/multiple_of))*multiple_of
class OccupancyRecord:
    def __init__(self, dev, threads, shared_mem=0, registers=0):
        physical_limits = PhysicalLimits(dev)
        limits = []
        allocated_warps = _int_ceiling(threads/physical_limits.threads_per_warp)
        max_warps_per_mp = physical_limits.warps_per_mp
        limits.append((min(physical_limits.thread_blocks_per_mp, _int_floor(max_warps_per_mp/allocated_warps)), 'warps'))
        if registers>0:
            if registers > physical_limits.reg_per_thread:
                limits.append((0, 'registers'))
            else:
                allocated_regs = {'warp': allocated_warps,
                                  'block': _int_ceiling(_int_ceiling(allocated_warps, physical_limits.warp_alloc_granularity)*registers*physical_limits.threads_per_warp,allocated_warps)}[physical_limits.reg_alloc_granularity]
                max_reg_per_mp = {'warp': _int_floor(physical_limits.num_32b_reg_per_mp/_int_ceiling(registers*physical_limits.threads_per_warp, physical_limits.reg_alloc_unit_size), physical_limits.warp_alloc_granularity),
                                  'block': physical_limits.num_32b_reg_per_mp}[physical_limits.reg_alloc_granularity]
                limits.append((_int_floor(max_reg_per_mp/allocated_regs), 'registers'))
        if shared_mem>0:
            allocated_shared_mem = _int_ceiling(shared_mem, physical_limits.shared_mem_alloc_unit_size)
            max_shared_mem_per_mp = physical_limits.shared_mem_per_mp
            limits.append((_int_floor(max_shared_mem_per_mp/allocated_shared_mem), 'shared memory'))
        self.limit, self.limited_by = min(limits)
        self.warps_per_mp = self.limit*allocated_warps
        self.occupancy = 100*self.warps_per_mp/physical_limits.warps_per_mp
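The tuner uses this record as a hard filter: genetic_operators.py discards any profile whose estimated occupancy falls below 10%. A usage sketch (assuming an NVIDIA device, since PhysicalLimits only knows NVIDIA compute capabilities, and assuming it is run from autotune/python/ so that tools is importable):

import pyopencl as cl
import tools

device = cl.get_platforms()[0].get_devices()[0]   # pick whichever device you are tuning
record = tools.OccupancyRecord(device, threads=256, shared_mem=4096, registers=32)
print(record.occupancy, '% occupancy, limited by', record.limited_by)
if record.occupancy < 10:
    print('this work-group configuration would be rejected by the tuner')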

33
autotune/python/utils.py Normal file

@@ -0,0 +1,33 @@
import pyopencl as cl
import pyviennacl as vcl
all_devices = [d for platform in cl.get_platforms() for d in platform.get_devices()]
DEVICE_TYPE_PREFIX = { cl.device_type.GPU: 'gpu',
                       cl.device_type.CPU: 'cpu',
                       cl.device_type.ACCELERATOR: 'accelerator' }

DEVICE_TYPE_CL_NAME = { cl.device_type.GPU: 'CL_DEVICE_TYPE_GPU',
                        cl.device_type.CPU: 'CL_DEVICE_TYPE_CPU',
                        cl.device_type.ACCELERATOR: 'CL_DEVICE_TYPE_ACCELERATOR' }

VENDOR_PREFIX = { vcl.opencl.VendorId.beignet_id: 'beignet',
                  vcl.opencl.VendorId.nvidia_id: 'nvidia',
                  vcl.opencl.VendorId.amd_id: 'amd',
                  vcl.opencl.VendorId.intel_id: 'intel' }

DEVICES_PRESETS = {'all': all_devices,
                   'gpus': [d for d in all_devices if d.type==cl.device_type.GPU],
                   'cpus': [d for d in all_devices if d.type==cl.device_type.CPU],
                   'accelerators': [d for d in all_devices if d.type==cl.device_type.ACCELERATOR] }
def sanitize_string(string, keep_chars = ['_']):
    string = string.replace(' ', '_').lower()
    string = "".join(c for c in string if c.isalnum() or c in keep_chars).rstrip()
    return string
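sanitize_string is what turns raw device names into identifiers that vclio.py below reuses as file names and C++ namespace names; a quick illustration (device names are examples only):

# Spaces become underscores, everything is lowercased, and any character that is
# neither alphanumeric nor explicitly kept is dropped.
from utils import sanitize_string   # importing utils also enumerates the OpenCL devices

print(sanitize_string('GeForce GTX 470'))        # geforce_gtx_470
print(sanitize_string('Tahiti (AMD FirePro)'))   # tahiti_amd_firepro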

116
autotune/python/vclio.py Normal file

@@ -0,0 +1,116 @@
import sys
import os
import pyviennacl as vcl  # needed for the vcl.float32/vcl.float64/vcl.opencl references below
import utils
def append_include(data, path):
    include_name = '#include "' + path +'"\n'
    already_included = data.find(include_name)
    if already_included == -1:
        insert_index = data.index('\n', data.index('#define')) + 1
        return data[:insert_index] + '\n' + include_name + data[insert_index:]
    return data
def generate_viennacl_headers(viennacl_root, device, datatype, operation, additional_parameters, parameters):
    builtin_database_dir = os.path.join(viennacl_root, "device_specific", "builtin_database")
    if not os.path.isdir(builtin_database_dir):
        raise EnvironmentError('ViennaCL root path is incorrect. Cannot access ' + builtin_database_dir + '!\n'
                               'Your version of ViennaCL may be too old and/or corrupted.')

    function_name_dict = { vcl.float32: 'add_4B',
                           vcl.float64: 'add_8B' }
    additional_parameters_dict = {'N': "char_to_type<'N'>",
                                  'T': "char_to_type<'T'>"}

    #Create the device-specific headers
    cpp_device_name = utils.sanitize_string(device.name)
    function_name = function_name_dict[datatype]
    operation = operation.replace('-','_')
    cpp_class_name = operation + '_template'
    header_name = cpp_device_name + ".hpp"
    function_declaration = 'inline void ' + function_name + '(' + ', '.join(['database_type<' + cpp_class_name + '::parameters_type> & db'] + \
                                                                            [additional_parameters_dict[x] for x in additional_parameters]) + ')'

    device_type_prefix = utils.DEVICE_TYPE_PREFIX[device.type]
    vendor_prefix = utils.VENDOR_PREFIX[device.vendor_id]
    architecture_family = vcl.opencl.architecture_family(device.vendor_id, device.name)

    header_hierarchy = ["devices", device_type_prefix, vendor_prefix, architecture_family]
    header_directory = os.path.join(builtin_database_dir, *header_hierarchy)
    header_path = os.path.join(header_directory, header_name)

    if not os.path.exists(header_directory):
        os.makedirs(header_directory)

    if os.path.exists(header_path):
        with open (header_path, "r") as myfile:
            data=myfile.read()
    else:
        data = ''

    if not data:
        ifndef_suffix = ('_'.join(header_hierarchy) + '_hpp_').upper()
        data = ('#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_' + ifndef_suffix + '\n'
                '#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_' + ifndef_suffix + '\n'
                '\n'
                '#include "viennacl/device_specific/forwards.h"\n'
                '#include "viennacl/device_specific/builtin_database/common.hpp"\n'
                '\n'
                'namespace viennacl{\n'
                'namespace device_specific{\n'
                'namespace builtin_database{\n'
                'namespace devices{\n'
                'namespace ' + device_type_prefix + '{\n'
                'namespace ' + vendor_prefix + '{\n'
                'namespace ' + architecture_family + '{\n'
                'namespace ' + cpp_device_name + '{\n'
                '\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '#endif\n'
                '')

    data = append_include(data, 'viennacl/device_specific/templates/' + cpp_class_name + '.hpp')

    add_to_database_arguments = [vendor_prefix + '_id', utils.DEVICE_TYPE_CL_NAME[device.type], 'ocl::'+architecture_family,
                                 '"' + device.name + '"', cpp_class_name + '::parameters' + str(parameters)]
    core = ' db.' + function_name + '(' + ', '.join(add_to_database_arguments) + ');'

    already_declared = data.find(function_declaration)
    if already_declared==-1:
        substr = 'namespace ' + cpp_device_name + '{\n'
        insert_index = data.index(substr) + len(substr)
        data = data[:insert_index] + '\n' + function_declaration + '\n{\n' + core + '\n}\n' + data[insert_index:]
    else:
        i1 = data.find('{', already_declared)
        if data[i1-1]=='\n':
            i1 = i1 - 1
        i2 = data.find('}', already_declared) + 1
        data = data[:i1] + '\n{\n' + core + '\n}' + data[i2:]

    #Write the header file
    with open(header_path, "w+") as myfile:
        myfile.write(data)

    #Updates the global ViennaCL headers
    with open(os.path.join(builtin_database_dir, operation + '.hpp'), 'r+') as operation_header:
        data = operation_header.read()
        data = append_include(data, os.path.relpath(header_path, os.path.join(viennacl_root, os.pardir)))
        scope_name = '_'.join(('init', operation) + additional_parameters)
        scope = data.index(scope_name)
        function_call = ' ' + '::'.join(header_hierarchy + [cpp_device_name, function_name]) + '(' + ', '.join(['result'] + [additional_parameters_dict[k] + '()' for k in additional_parameters]) + ')'
        if function_call not in data:
            insert_index = data.rindex('\n', 0, data.index('return result', scope))
            data = data[:insert_index] + function_call + ';\n' + data[insert_index:]
        operation_header.seek(0)
        operation_header.truncate()
        operation_header.write(data)