Added exhaustive search backend

2014-09-11 16:13:46 -04:00
parent bf48d69b17
commit 08d17aa58c
4 changed files with 119 additions and 77 deletions
--- a/autotune/python/autotune.py
+++ b/autotune/python/autotune.py
@@ -48,10 +48,10 @@ TYPES = { 'vector-axpy': {'template':vcl.atidlas.VectorAxpyTemplate,
 def parameter_space(operation):
  simd = [1, 2, 4, 8]
  pow2_1D = [2**k for k in range(12)]
-  pow2_2D = [2**k for k in range(10)]
-  pow2_2D_unrolled = [2**k for k in range(6)]
+  pow2_2D = [8, 16]
+  pow2_2D_unrolled = [1, 2, 4, 8]
  FetchingPolicy = vcl.atidlas.FetchingPolicy
-  fetch = [FetchingPolicy.FETCH_FROM_LOCAL, FetchingPolicy.FETCH_FROM_GLOBAL_STRIDED, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS]
+  fetch = [FetchingPolicy.FETCH_FROM_LOCAL]
  if operation == 'vector-axpy': return [simd, pow2_1D, pow2_1D, fetch]
  if operation == 'reduction': return [simd, pow2_1D, pow2_1D, fetch]
  if operation == 'matrix-axpy': return [simd, pow2_2D, pow2_2D, pow2_2D, pow2_2D, fetch]
@@ -97,7 +97,7 @@ def do_tuning(config_fname, spec_fname, viennacl_root):
            fname = os.devnull
          with open(fname, "w+") as archive:
            with vcl.Statement(node) as statement:
-              result = optimize.genetic(statement, ctx, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params),
+              result = optimize.exhaustive(statement, ctx, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params),
                                    TYPES[operation]['parameter-names'], parameter_space(operation), lambda t: TYPES[operation]['perf-index']([datatype().itemsize, s, t]), TYPES[operation]['perf-measure'], archive)
            if result and viennacl_root:
              vclio.generate_viennacl_headers(viennacl_root, device, datatype, operation, other_params, result[1])
--- a/autotune/python/genetic_operators.py
+++ b/autotune/python/genetic_operators.py
@@ -1,9 +1,12 @@
 import random
 import time
+import sys
 import tools
 import pyviennacl as vcl
 import numpy

+from deap import algorithms
+
 from collections import OrderedDict as odict

 def closest_divisor(N, x):
@@ -154,28 +157,51 @@ class GeneticOperators(object):
    return individual,
      
  def evaluate(self, individual):
-    tupindividual = tuple(individual)
-    if tupindividual not in self.cache:
+    if tuple(individual) not in self.cache:
      template = self.build_template(self.TemplateType.Parameters(*individual))
-      registers_usage = template.registers_usage(vcl.atidlas.StatementsTuple(self.statement))/4
-      lmem_usage = template.lmem_usage(vcl.atidlas.StatementsTuple(self.statement))
-      local_size = template.parameters.local_size_0*template.parameters.local_size_1
-      occupancy_record = tools.OccupancyRecord(self.device, local_size, lmem_usage, registers_usage)
-      if occupancy_record.occupancy < 15 :
-        self.cache[tupindividual] = 10
-      else:
      try:
-          template.execute(self.statement, True)
-          self.statement.result.context.finish_all_queues()
-          N = 0
-          current_time = 0
-          while current_time < 1e-2:
-            time_before = time.time()
-            template.execute(self.statement,False)
-            self.statement.result.context.finish_all_queues()
-            current_time += time.time() - time_before
-            N+=1
-          self.cache[tupindividual] = current_time/N
+        self.cache[tuple(individual)] = tools.benchmark(template, self.statement, self.device)
      except:
-          self.cache[tupindividual] = 10
-    return self.cache[tupindividual],
+        self.cache[tuple(individual)] = 10
+    return self.cache[tuple(individual)],
+
+def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, maxtime, maxgen, halloffame, compute_perf, perf_metric):
+    """Run a (mu + lambda) evolutionary loop bounded by time and generations.
+
+    population: individuals exposing ``fitness.valid`` / ``fitness.values``;
+        the list is updated in place and also returned.
+    toolbox: must provide ``map``, ``evaluate`` and ``select``, plus whatever
+        ``algorithms.varOr`` needs to produce offspring.
+    mu: passed to ``toolbox.select`` as the number of individuals to keep.
+    lambda_, cxpb, mutpb: forwarded unchanged to ``algorithms.varOr``.
+    maxtime: time budget as a string in ``'%Mm%Ss'`` form, e.g. ``'02m30s'``.
+    maxgen: maximum number of generations.
+    halloffame: optional hall of fame updated with every evaluated batch;
+        when it is None or empty the progress line omits the best profile.
+    compute_perf: maps the best fitness value to the figure that is printed.
+    perf_metric: unit label printed next to that figure.
+
+    Returns the final population.
+    """
+    # Evaluate the individuals with an invalid fitness
+    invalid_ind = [ind for ind in population if not ind.fitness.valid]
+    fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
+    for ind, fit in zip(invalid_ind, fitnesses):
+        ind.fitness.values = fit
+
+    if halloffame is not None:
+        halloffame.update(population)
+
+    # Convert the 'MMmSSs' budget into a number of seconds
+    maxtime = time.strptime(maxtime, '%Mm%Ss')
+    maxtime = maxtime.tm_min*60 + maxtime.tm_sec
+
+    # Begin the generational process
+    gen = 0
+    start_time = time.time()
+    while time.time() - start_time < maxtime and gen < maxgen:
+        # Vary the population
+        offspring = algorithms.varOr(population, toolbox, lambda_, cxpb, mutpb)
+
+        # Evaluate the individuals with an invalid fitness
+        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
+        fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
+        for ind, fit in zip(invalid_ind, fitnesses):
+            ind.fitness.values = fit
+
+        # Update the hall of fame with the generated individuals
+        if halloffame is not None:
+            halloffame.update(offspring)
+
+        # Select the next generation population
+        population[:] = toolbox.select(population + offspring, mu)
+
+        gen = gen + 1
+
+        # Report progress. The best profile can only be shown when a
+        # non-empty hall of fame is maintained; indexing it otherwise would
+        # raise, although the updates above explicitly allow None.
+        if halloffame is not None and len(halloffame) > 0:
+            best_profile = '(%s)'%','.join(map(str,halloffame[0]))
+            best_performance = compute_perf(halloffame[0].fitness.values[0])
+            sys.stdout.write('Generation %d | Time %d | Best %d %s [ for %s ]\n'%(gen, time.time() - start_time, best_performance, perf_metric, best_profile))
+        else:
+            sys.stdout.write('Generation %d | Time %d\n'%(gen, time.time() - start_time))
+    sys.stdout.write('\n')
+    return population
--- a/autotune/python/optimize.py
+++ b/autotune/python/optimize.py
@@ -1,56 +1,38 @@
 import array
 import numpy as np
 import random
-import time
-import sys

-from deap import algorithms
+import itertools
+
+import tools
+import deap.tools
+
 from deap import base
 from deap import creator
-from deap import tools
+from genetic import GeneticOperators
+from genetic import eaMuPlusLambda

-from genetic_operators import GeneticOperators
+def exhaustive(statement, context, TemplateType, build_template, parameter_names, all_parameters, compute_perf, perf_metric, out):
+  """Benchmark every combination in all_parameters and print the fastest.
+
+  statement, context: the statement to run and the context whose first
+    device is used for the validity checks and the timings.
+  TemplateType, build_template: TemplateType.Parameters(*individual) is
+    handed to build_template to obtain the template to benchmark.
+  all_parameters: one list of candidate values per parameter; the search
+    walks their Cartesian product.
+  compute_perf, perf_metric: convert the best time into the printed figure
+    and label it in the progress line.
+  parameter_names, out: accepted so the signature matches genetic(); they
+    are not used here.
+
+  NOTE(review): nothing is returned, so a caller that tests or indexes the
+  result only ever sees None -- confirm whether the best profile should be
+  returned and in which shape.
+  """
+  device = context.devices[0]
+  # First pass: count the profiles that tools.skip() accepts; this is the
+  # denominator of the progress line.
+  nvalid = 0
+  for individual in itertools.product(*all_parameters):
+    template = build_template(TemplateType.Parameters(*individual))
+    if not tools.skip(template, statement, device):
+      nvalid = nvalid + 1
+  # Second pass: time every profile and remember the fastest one.
+  current = 0
+  minT = float('inf')
+  best = None
+  for individual in itertools.product(*all_parameters):
+    template = build_template(TemplateType.Parameters(*individual))
+    try:
+      T = tools.benchmark(template,statement,device)
+      current = current + 1
+      if T < minT:
+        minT = T
+        best = individual
+      print '%d / %d , Best is %d %s for %s\r'%(current, nvalid, compute_perf(minT), perf_metric, best)
+    except Exception:
+      # Best effort: a profile that cannot be benchmarked or reported is
+      # simply skipped. Catching Exception instead of everything lets
+      # KeyboardInterrupt (Ctrl-C) abort the sweep.
+      continue
    
-def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, maxtime, maxgen, halloffame, compute_perf, perf_metric):
-    # Evaluate the individuals with an invalid fitness
-    invalid_ind = [ind for ind in population if not ind.fitness.valid]
-    fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
-    for ind, fit in zip(invalid_ind, fitnesses):
-        ind.fitness.values = fit
-
-    if halloffame is not None:
-        halloffame.update(population)
-
-    # Begin the generational process
-    gen = 0
-    maxtime = time.strptime(maxtime, '%Mm%Ss')
-    maxtime = maxtime.tm_min*60 + maxtime.tm_sec
-    start_time = time.time()
-    while time.time() - start_time < maxtime and gen < maxgen:
-        # Vary the population
-        offspring = algorithms.varOr(population, toolbox, lambda_, cxpb, mutpb)
-        
-        # Evaluate the individuals with an invalid fitness
-        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
-        fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
-        for ind, fit in zip(invalid_ind, fitnesses):
-            ind.fitness.values = fit
-        
-        # Update the hall of fame with the generated individuals
-        if halloffame is not None:
-            halloffame.update(offspring)
-
-        # Select the next generation population
-        population[:] = toolbox.select(population + offspring, mu)
-
-        # Update the statistics with the new population
-        gen = gen + 1
-        
-        best_profile = '(%s)'%','.join(map(str,halloffame[0]));
-        best_performance = compute_perf(halloffame[0].fitness.values[0])
-        sys.stdout.write('Generation %d | Time %d | Best %d %s [ for %s ]\n'%(gen, time.time() - start_time, best_performance, perf_metric, best_profile))
-    sys.stdout.write('\n')
-    return population
  
 def genetic(statement, context, TemplateType, build_template, parameter_names, all_parameters, compute_perf, perf_metric, out):
  gen = GeneticOperators(context.devices[0], statement, all_parameters, parameter_names, TemplateType, build_template)
@@ -68,12 +50,12 @@ def genetic(statement, context, TemplateType, build_template, parameter_names, a
  toolbox.register("select", tools.selBest)
    
  pop = toolbox.population(n=30)
-  hof = tools.HallOfFame(1)
+  hof = deap.tools.HallOfFame(1)

  best_performer = lambda x: max([compute_perf(hof[0].fitness.values[0]) for t in x])
  best_profile = lambda x: '(%s)'%','.join(map(str,hof[0]))
  
-  stats = tools.Statistics(lambda ind: ind.fitness.values)
+  stats = deap.tools.Statistics(lambda ind: ind.fitness.values)
  stats.register("max (" + perf_metric + ")", lambda x: max([compute_perf(hof[0].fitness.values[0]) for t in x]))
  stats.register("profile ", lambda x: '(%s)'%','.join(map(str,hof[0])))

--- a/autotune/python/tools.py
+++ b/autotune/python/tools.py
@@ -1,5 +1,7 @@
 from __future__ import division
 import pyopencl
+import time
+from pyviennacl.atidlas import StatementsTuple

 class PhysicalLimits:
    def __init__(self, dev):
@@ -101,4 +103,36 @@ class OccupancyRecord:
      self.occupancy = 100*self.warps_per_mp/physical_limits.warps_per_mp
        

+def skip(template, statement, device):
+      """Tell whether a template should be left out for this statement/device.
+
+      Returns True when template.check(statement) is truthy or when the
+      occupancy computed by OccupancyRecord is below 15, False otherwise.
+      NOTE(review): a truthy check() presumably flags an invalid template --
+      confirm against the template API.
+      """
+      stmts = StatementsTuple(statement)
+      # NOTE(review): the /4 presumably converts a byte count into 32-bit
+      # registers -- confirm.
+      regs = template.registers_usage(stmts)/4
+      lmem = template.lmem_usage(stmts)
+      workgroup = template.parameters.local_size_0*template.parameters.local_size_1
+      record = OccupancyRecord(device, workgroup, lmem, regs)
+      return bool(template.check(statement) or record.occupancy < 15)
      
+def benchmark(template, statement, device):
+      """Return the average time, in seconds, of one execution of template.
+
+      The template is executed once up front, then repeatedly until the
+      accumulated wall-clock time reaches 1e-2 s; the mean of those timed
+      runs is returned.
+
+      Raises ValueError("Template has too low occupancy") when the computed
+      occupancy is below 15, and ValueError("Invalid template") when an
+      execution fails.
+      """
+      statements = StatementsTuple(statement)
+      # NOTE(review): the /4 presumably converts a byte count into 32-bit
+      # registers -- confirm.
+      registers_usage = template.registers_usage(statements)/4
+      lmem_usage = template.lmem_usage(statements)
+      local_size = template.parameters.local_size_0*template.parameters.local_size_1
+      occupancy_record = OccupancyRecord(device, local_size, lmem_usage, registers_usage)
+      if occupancy_record.occupancy < 15 :
+        raise ValueError("Template has too low occupancy")
+      try:
+        # Untimed first run. NOTE(review): the True flag presumably forces
+        # the kernel to be (re)generated -- confirm against the template API.
+        template.execute(statement, True)
+        statement.result.context.finish_all_queues()
+        N = 0
+        current_time = 0
+        while current_time < 1e-2:
+          time_before = time.time()
+          template.execute(statement,False)
+          # Wait for the queues so the measurement covers the whole run.
+          statement.result.context.finish_all_queues()
+          current_time += time.time() - time_before
+          N+=1
+        return current_time/N
+      except Exception:
+        # Exception rather than a bare except: KeyboardInterrupt and
+        # SystemExit must not be reported as an invalid template.
+        raise ValueError("Invalid template")