diff --git a/autotune/python/autotune.py b/autotune/python/autotune.py index 0ad94b4c3..a40d4c221 100644 --- a/autotune/python/autotune.py +++ b/autotune/python/autotune.py @@ -48,10 +48,10 @@ TYPES = { 'vector-axpy': {'template':vcl.atidlas.VectorAxpyTemplate, def parameter_space(operation): simd = [1, 2, 4, 8] pow2_1D = [2**k for k in range(12)] - pow2_2D = [2**k for k in range(10)] - pow2_2D_unrolled = [2**k for k in range(6)] + pow2_2D = [8, 16] + pow2_2D_unrolled = [1, 2, 4, 8] FetchingPolicy = vcl.atidlas.FetchingPolicy - fetch = [FetchingPolicy.FETCH_FROM_LOCAL, FetchingPolicy.FETCH_FROM_GLOBAL_STRIDED, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS] + fetch = [FetchingPolicy.FETCH_FROM_LOCAL] if operation == 'vector-axpy': return [simd, pow2_1D, pow2_1D, fetch] if operation == 'reduction': return [simd, pow2_1D, pow2_1D, fetch] if operation == 'matrix-axpy': return [simd, pow2_2D, pow2_2D, pow2_2D, pow2_2D, fetch] @@ -97,7 +97,7 @@ def do_tuning(config_fname, spec_fname, viennacl_root): fname = os.devnull with open(fname, "w+") as archive: with vcl.Statement(node) as statement: - result = optimize.genetic(statement, ctx, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params), + result = optimize.exhaustive(statement, ctx, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params), TYPES[operation]['parameter-names'], parameter_space(operation), lambda t: TYPES[operation]['perf-index']([datatype().itemsize, s, t]), TYPES[operation]['perf-measure'], archive) if result and viennacl_root: vclio.generate_viennacl_headers(viennacl_root, device, datatype, operation, other_params, result[1]) diff --git a/autotune/python/genetic_operators.py b/autotune/python/genetic.py similarity index 74% rename from autotune/python/genetic_operators.py rename to autotune/python/genetic.py index fcad88b58..1ea5743db 100644 --- a/autotune/python/genetic_operators.py +++ b/autotune/python/genetic.py @@ -1,9 +1,12 @@ import 
def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, maxtime, maxgen, halloffame, compute_perf, perf_metric):
    """Run a time- and generation-bounded (mu + lambda) evolutionary loop.

    population   -- list of individuals exposing ``fitness.valid`` and
                    ``fitness.values``; it is updated in place and returned.
    toolbox      -- object providing ``map``, ``evaluate`` and ``select``
                    (plus whatever ``algorithms.varOr`` needs from it).
    mu           -- number of individuals handed to ``toolbox.select`` as the
                    size of the next generation.
    lambda_, cxpb, mutpb -- forwarded unchanged to ``algorithms.varOr``.
    maxtime      -- time budget as a string of the form ``'<min>m<sec>s'``
                    (parsed with ``time.strptime(maxtime, '%Mm%Ss')``, so a
                    malformed string raises ValueError).
    maxgen       -- maximum number of generations to run.
    halloffame   -- object with ``update(individuals)`` whose element 0 is the
                    best individual so far, or None to disable tracking.
    compute_perf -- callable turning the best fitness value into the figure
                    shown in the progress line.
    perf_metric  -- label printed next to that figure.

    Returns the final population (the same list object that was passed in).
    """
    def evaluate_invalid(individuals):
        # Only individuals without a valid fitness are (re-)evaluated.
        pending = [ind for ind in individuals if not ind.fitness.valid]
        for ind, fit in zip(pending, toolbox.map(toolbox.evaluate, pending)):
            ind.fitness.values = fit

    evaluate_invalid(population)
    if halloffame is not None:
        halloffame.update(population)

    # Begin the generational process
    budget = time.strptime(maxtime, '%Mm%Ss')
    budget = budget.tm_min*60 + budget.tm_sec
    gen = 0
    start_time = time.time()
    while time.time() - start_time < budget and gen < maxgen:
        # Vary the population
        offspring = algorithms.varOr(population, toolbox, lambda_, cxpb, mutpb)
        evaluate_invalid(offspring)

        # Update the hall of fame with the generated individuals
        if halloffame is not None:
            halloffame.update(offspring)

        # Select the next generation population
        population[:] = toolbox.select(population + offspring, mu)
        gen = gen + 1

        # The progress line needs a best individual; without a (non-empty)
        # hall of fame there is nothing to report, so skip it instead of
        # failing on halloffame[0].
        if halloffame is not None and len(halloffame) > 0:
            best_profile = '(%s)' % ','.join(map(str, halloffame[0]))
            best_performance = compute_perf(halloffame[0].fitness.values[0])
            sys.stdout.write('Generation %d | Time %d | Best %d %s [ for %s ]\n'%(gen, time.time() - start_time, best_performance, perf_metric, best_profile))
    sys.stdout.write('\n')
    return population
def exhaustive(statement, context, TemplateType, build_template, parameter_names, all_parameters, compute_perf, perf_metric, out):
    """Benchmark every accepted point of the parameter grid and report the best.

    statement      -- statement handed to ``tools.skip`` / ``tools.benchmark``.
    context        -- object whose ``devices[0]`` is the device to tune for.
    TemplateType   -- type whose ``Parameters(*values)`` builds one parameter set.
    build_template -- callable turning such a parameter set into a template.
    parameter_names, out -- accepted for signature parity with ``genetic``;
                      not used here.
    all_parameters -- sequence of value lists; their Cartesian product is the
                      search space.
    compute_perf   -- callable turning a measured time into the figure shown
                      in the progress line.
    perf_metric    -- label printed next to that figure.

    Returns None.
    NOTE(review): the caller in autotune.py reads ``result[1]`` to generate
    headers, so it probably expects the winning configuration back — confirm
    the intended return shape before adding one.
    """
    device = context.devices[0]

    # First pass: keep only the configurations that tools.skip accepts, so
    # that rejected templates are never executed and the "current / nvalid"
    # progress figure really counts against what will be benchmarked.
    candidates = []
    for individual in itertools.product(*all_parameters):
        template = build_template(TemplateType.Parameters(*individual))
        if not tools.skip(template, statement, device):
            candidates.append(individual)
    nvalid = len(candidates)

    # Second pass: time each surviving configuration.
    current = 0
    best = None
    minT = float('inf')
    for individual in candidates:
        template = build_template(TemplateType.Parameters(*individual))
        try:
            T = tools.benchmark(template, statement, device)
        except Exception:
            # Best effort: a configuration that cannot be benchmarked is
            # simply ignored. KeyboardInterrupt/SystemExit still propagate.
            continue
        current = current + 1
        if T < minT:
            minT = T
            best = individual
        if best is not None:
            print('%d / %d , Best is %d %s for %s\r'%(current, nvalid, compute_perf(minT), perf_metric, best))
# Occupancy threshold, in percent (OccupancyRecord.occupancy is computed as
# 100*warps_per_mp/physical_limits.warps_per_mp); templates below it are
# rejected without being run.
_MIN_OCCUPANCY = 15

# Minimum accumulated run time, in seconds, over which a timing is averaged.
_MIN_BENCHMARK_TIME = 1e-2


def _occupancy(template, statement, device):
    """Return the occupancy OccupancyRecord reports for *template* on *device*."""
    statements = StatementsTuple(statement)
    # NOTE(review): the /4 presumably converts a byte count into 32-bit
    # registers — confirm against template.registers_usage.
    registers_usage = template.registers_usage(statements)/4
    lmem_usage = template.lmem_usage(statements)
    local_size = template.parameters.local_size_0*template.parameters.local_size_1
    return OccupancyRecord(device, local_size, lmem_usage, registers_usage).occupancy


def skip(template, statement, device):
    """Tell whether *template* should be left out of the search.

    Returns True when ``template.check(statement)`` is truthy or when the
    estimated occupancy is below the threshold, False otherwise.
    """
    occupancy = _occupancy(template, statement, device)
    return bool(template.check(statement)) or occupancy < _MIN_OCCUPANCY


def benchmark(template, statement, device):
    """Return the average time, in seconds, of one execution of *template*.

    The template is executed once up front, then repeatedly until at least
    _MIN_BENCHMARK_TIME seconds of measured time have accumulated; the
    result is that accumulated time divided by the number of timed runs.

    Raises ValueError if the estimated occupancy is below the threshold, or
    if executing the template fails.
    """
    if _occupancy(template, statement, device) < _MIN_OCCUPANCY:
        raise ValueError("Template has too low occupancy")
    try:
        # Untimed first run. NOTE(review): the True flag presumably forces
        # (re)compilation so that its cost stays out of the timings — confirm.
        template.execute(statement, True)
        statement.result.context.finish_all_queues()
        runs = 0
        elapsed = 0
        while elapsed < _MIN_BENCHMARK_TIME:
            time_before = time.time()
            template.execute(statement, False)
            # Wait for the queues so the interval covers the whole execution.
            statement.result.context.finish_all_queues()
            elapsed += time.time() - time_before
            runs += 1
        return elapsed/runs
    except Exception:
        # Not a bare except: KeyboardInterrupt/SystemExit must not be turned
        # into a ValueError that callers then treat as a bad configuration.
        raise ValueError("Invalid template")