Added exhaustive search backend

This commit is contained in:
Philippe Tillet
2014-09-11 16:13:46 -04:00
parent bf48d69b17
commit 08d17aa58c
4 changed files with 119 additions and 77 deletions

View File

@@ -48,10 +48,10 @@ TYPES = { 'vector-axpy': {'template':vcl.atidlas.VectorAxpyTemplate,
def parameter_space(operation):
simd = [1, 2, 4, 8]
pow2_1D = [2**k for k in range(12)]
pow2_2D = [2**k for k in range(10)]
pow2_2D_unrolled = [2**k for k in range(6)]
pow2_2D = [8, 16]
pow2_2D_unrolled = [1, 2, 4, 8]
FetchingPolicy = vcl.atidlas.FetchingPolicy
fetch = [FetchingPolicy.FETCH_FROM_LOCAL, FetchingPolicy.FETCH_FROM_GLOBAL_STRIDED, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS]
fetch = [FetchingPolicy.FETCH_FROM_LOCAL]
if operation == 'vector-axpy': return [simd, pow2_1D, pow2_1D, fetch]
if operation == 'reduction': return [simd, pow2_1D, pow2_1D, fetch]
if operation == 'matrix-axpy': return [simd, pow2_2D, pow2_2D, pow2_2D, pow2_2D, fetch]
@@ -97,7 +97,7 @@ def do_tuning(config_fname, spec_fname, viennacl_root):
fname = os.devnull
with open(fname, "w+") as archive:
with vcl.Statement(node) as statement:
result = optimize.genetic(statement, ctx, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params),
result = optimize.exhaustive(statement, ctx, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params),
TYPES[operation]['parameter-names'], parameter_space(operation), lambda t: TYPES[operation]['perf-index']([datatype().itemsize, s, t]), TYPES[operation]['perf-measure'], archive)
if result and viennacl_root:
vclio.generate_viennacl_headers(viennacl_root, device, datatype, operation, other_params, result[1])

View File

@@ -1,9 +1,12 @@
import random
import time
import sys
import tools
import pyviennacl as vcl
import numpy
from deap import algorithms
from collections import OrderedDict as odict
def closest_divisor(N, x):
@@ -154,28 +157,51 @@ class GeneticOperators(object):
return individual,
def evaluate(self, individual):
    """Return the fitness of ``individual`` as a 1-tuple, measuring it on first use.

    Results are memoised in ``self.cache`` under ``tuple(individual)``, so a
    parameter combination is benchmarked at most once.

    On a cache miss the template is built from the individual's parameters
    and timed with ``tools.benchmark``. If that raises, the value 10 is
    stored instead (presumably a penalty time in seconds that no real
    kernel reaches -- confirm against the selection operator).

    NOTE(review): this span previously interleaved the superseded inline
    timing loop with the ``tools.benchmark`` call that replaces it; only the
    ``tools.benchmark`` path is kept here.
    """
    # Lists are unhashable; the tuple form is the cache key.
    key = tuple(individual)
    if key not in self.cache:
        template = self.build_template(self.TemplateType.Parameters(*individual))
        try:
            self.cache[key] = tools.benchmark(template, self.statement, self.device)
        except Exception:
            # Narrower than a bare ``except`` so Ctrl-C still stops the search.
            self.cache[key] = 10
    # The trailing comma is deliberate: the fitness is returned as a tuple.
    return self.cache[key],
def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, maxtime, maxgen, halloffame, compute_perf, perf_metric):
    """Run a (mu + lambda) evolutionary loop bounded by time and generation count.

    Parameters:
        population: list of individuals; updated in place each generation.
        toolbox: object providing ``map``, ``evaluate`` and ``select``.
        mu: number of individuals kept by ``toolbox.select`` each generation.
        lambda_, cxpb, mutpb: forwarded unchanged to ``algorithms.varOr``.
        maxtime: time budget as a string in ``'%Mm%Ss'`` form, e.g. ``'2m30s'``.
        maxgen: maximum number of generations.
        halloffame: object with ``update`` and index access, or ``None``.
        compute_perf: maps the best fitness value to the reported figure.
        perf_metric: unit label printed next to the reported figure.

    Returns:
        The ``population`` list that was passed in.

    One progress line is written to stdout per generation, followed by a
    final blank line.
    """
    def evaluate_pending(individuals):
        # Only individuals without a valid fitness are (re-)evaluated.
        pending = [ind for ind in individuals if not ind.fitness.valid]
        for ind, fit in zip(pending, toolbox.map(toolbox.evaluate, pending)):
            ind.fitness.values = fit

    evaluate_pending(population)
    if halloffame is not None:
        halloffame.update(population)

    # Convert the 'MmSs' budget into seconds.
    parsed = time.strptime(maxtime, '%Mm%Ss')
    maxtime = parsed.tm_min*60 + parsed.tm_sec

    gen = 0
    start_time = time.time()
    while time.time() - start_time < maxtime and gen < maxgen:
        # Vary the population, then score the new individuals.
        offspring = algorithms.varOr(population, toolbox, lambda_, cxpb, mutpb)
        evaluate_pending(offspring)
        if halloffame is not None:
            halloffame.update(offspring)

        # Slice assignment keeps the caller's list object.
        population[:] = toolbox.select(population + offspring, mu)
        gen = gen + 1

        elapsed = time.time() - start_time
        if halloffame is not None:
            best_profile = '(%s)'%','.join(map(str,halloffame[0]))
            best_performance = compute_perf(halloffame[0].fitness.values[0])
            sys.stdout.write('Generation %d | Time %d | Best %d %s [ for %s ]\n'%(gen, elapsed, best_performance, perf_metric, best_profile))
        else:
            # No hall of fame means there is no best individual to report;
            # this case used to fail when indexing None.
            sys.stdout.write('Generation %d | Time %d\n'%(gen, elapsed))
    sys.stdout.write('\n')
    return population

View File

@@ -1,56 +1,38 @@
import array
import numpy as np
import random
import time
import sys
from deap import algorithms
import itertools
import tools
import deap.tools
from deap import base
from deap import creator
from deap import tools
from genetic import GeneticOperators
from genetic import eaMuPlusLambda
from genetic_operators import GeneticOperators
def exhaustive(statement, context, TemplateType, build_template, parameter_names, all_parameters, compute_perf, perf_metric, out):
    """Benchmark every combination in ``all_parameters`` and report the fastest.

    Parameters:
        statement: statement handed to ``tools.skip`` / ``tools.benchmark``.
        context: object whose ``devices[0]`` is the device to tune for.
        TemplateType: class exposing ``Parameters(*values)``.
        build_template: callable turning a ``Parameters`` object into a template.
        parameter_names: unused here; kept for signature parity with ``genetic``.
        all_parameters: one list of candidate values per template parameter.
        compute_perf: maps a measured time to the reported figure.
        perf_metric: unit label printed next to the reported figure.
        out: unused here; kept for signature parity with ``genetic``.

    Returns:
        None.

    NOTE(review): the caller in ``do_tuning`` tests ``result`` and then reads
    ``result[1]``; since nothing is returned, header generation is skipped
    for this backend. Returning the best parameters needs the shape that
    ``genetic`` returns, which is not visible here -- confirm before changing.
    """
    device = context.devices[0]

    # First pass: count the combinations that are not skipped, used only as
    # the denominator of the progress line.
    nvalid = 0
    for individual in itertools.product(*all_parameters):
        template = build_template(TemplateType.Parameters(*individual))
        if not tools.skip(template, statement, device):
            nvalid = nvalid + 1

    # Second pass: time every combination and keep the fastest one.
    current = 0
    minT = float('inf')
    best = None
    for individual in itertools.product(*all_parameters):
        template = build_template(TemplateType.Parameters(*individual))
        try:
            T = tools.benchmark(template, statement, device)
            current = current + 1
            if T < minT:
                minT = T
                best = individual
            # Parenthesised single argument: same output on Python 2 and 3.
            print('%d / %d , Best is %d %s for %s\r'%(current, nvalid, compute_perf(minT), perf_metric, best))
        except Exception:
            # Templates that fail to benchmark are skipped on purpose.
            # Catching Exception rather than everything lets Ctrl-C abort
            # the search.
            pass
def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, maxtime, maxgen, halloffame, compute_perf, perf_metric):
    """Evolve ``population`` with a (mu + lambda) strategy until time or generations run out.

    Parameters:
        population: list of individuals; rewritten in place every generation.
        toolbox: object providing ``map``, ``evaluate`` and ``select``.
        mu: size of the population chosen by ``toolbox.select``.
        lambda_, cxpb, mutpb: passed through to ``algorithms.varOr``.
        maxtime: time budget string matching ``'%Mm%Ss'`` (e.g. ``'1m00s'``).
        maxgen: upper bound on the number of generations.
        halloffame: object with ``update`` and index access, or ``None``.
        compute_perf: converts the best fitness value into the printed figure.
        perf_metric: unit label for the printed figure.

    Returns:
        The same ``population`` list that was passed in.
    """
    # Score the individuals of the initial population that lack a valid fitness.
    invalid_ind = [ind for ind in population if not ind.fitness.valid]
    fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
    for ind, fit in zip(invalid_ind, fitnesses):
        ind.fitness.values = fit
    if halloffame is not None:
        halloffame.update(population)

    # Time budget in seconds, parsed from the 'MmSs' string.
    budget = time.strptime(maxtime, '%Mm%Ss')
    maxtime = budget.tm_min*60 + budget.tm_sec

    gen = 0
    start_time = time.time()
    while time.time() - start_time < maxtime and gen < maxgen:
        # Vary the population.
        offspring = algorithms.varOr(population, toolbox, lambda_, cxpb, mutpb)

        # Score the offspring that lack a valid fitness.
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit

        if halloffame is not None:
            halloffame.update(offspring)

        # Select the next generation among parents and offspring.
        population[:] = toolbox.select(population + offspring, mu)
        gen = gen + 1

        if halloffame is None:
            # There is no best individual to describe without a hall of
            # fame; indexing None here used to raise.
            sys.stdout.write('Generation %d | Time %d\n'%(gen, time.time() - start_time))
            continue
        best_profile = '(%s)'%','.join(map(str,halloffame[0]))
        best_performance = compute_perf(halloffame[0].fitness.values[0])
        sys.stdout.write('Generation %d | Time %d | Best %d %s [ for %s ]\n'%(gen, time.time() - start_time, best_performance, perf_metric, best_profile))
    sys.stdout.write('\n')
    return population
def genetic(statement, context, TemplateType, build_template, parameter_names, all_parameters, compute_perf, perf_metric, out):
gen = GeneticOperators(context.devices[0], statement, all_parameters, parameter_names, TemplateType, build_template)
@@ -68,12 +50,12 @@ def genetic(statement, context, TemplateType, build_template, parameter_names, a
toolbox.register("select", tools.selBest)
pop = toolbox.population(n=30)
hof = tools.HallOfFame(1)
hof = deap.tools.HallOfFame(1)
best_performer = lambda x: max([compute_perf(hof[0].fitness.values[0]) for t in x])
best_profile = lambda x: '(%s)'%','.join(map(str,hof[0]))
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats = deap.tools.Statistics(lambda ind: ind.fitness.values)
stats.register("max (" + perf_metric + ")", lambda x: max([compute_perf(hof[0].fitness.values[0]) for t in x]))
stats.register("profile ", lambda x: '(%s)'%','.join(map(str,hof[0])))

View File

@@ -1,5 +1,7 @@
from __future__ import division
import pyopencl
import time
from pyviennacl.atidlas import StatementsTuple
class PhysicalLimits:
def __init__(self, dev):
@@ -101,4 +103,36 @@ class OccupancyRecord:
self.occupancy = 100*self.warps_per_mp/physical_limits.warps_per_mp
def skip(template, statement, device):
    """Tell whether ``template`` should be left out of the search for ``statement``.

    Returns True when ``template.check(statement)`` is truthy or when the
    occupancy computed by ``OccupancyRecord`` for ``device`` is below 15;
    returns False otherwise.
    """
    stmts = StatementsTuple(statement)
    # NOTE(review): the division by 4 presumably converts bytes to 32-bit
    # registers -- confirm.
    regs = template.registers_usage(stmts)/4
    lmem = template.lmem_usage(stmts)
    threads = template.parameters.local_size_0*template.parameters.local_size_1
    record = OccupancyRecord(device, threads, lmem, regs)
    return True if (template.check(statement) or record.occupancy < 15) else False
def benchmark(template, statement, device):
    """Return the average wall-clock time, in seconds, of one execution of ``template``.

    Parameters:
        template: object providing ``registers_usage``, ``lmem_usage``,
            ``parameters`` and ``execute``.
        statement: statement to execute; its ``result.context`` is used to
            wait for the queues between runs.
        device: device handed to ``OccupancyRecord``.

    Raises:
        ValueError: "Template has too low occupancy" when the computed
            occupancy is below 15, or "Invalid template" when executing the
            template raises.
    """
    statements = StatementsTuple(statement)
    registers_usage = template.registers_usage(statements)/4
    lmem_usage = template.lmem_usage(statements)
    local_size = template.parameters.local_size_0*template.parameters.local_size_1
    occupancy_record = OccupancyRecord(device, local_size, lmem_usage, registers_usage)
    if occupancy_record.occupancy < 15:
        raise ValueError("Template has too low occupancy")

    try:
        # Untimed first run. NOTE(review): the True flag presumably forces
        # (re)compilation -- confirm against template.execute.
        template.execute(statement, True)
        statement.result.context.finish_all_queues()

        # Repeat until at least 10 ms have been accumulated so the average
        # is not dominated by timer resolution.
        N = 0
        current_time = 0
        while current_time < 1e-2:
            time_before = time.time()
            template.execute(statement,False)
            statement.result.context.finish_all_queues()
            current_time += time.time() - time_before
            N+=1
        return current_time/N
    except Exception:
        # Exception, not a bare except: KeyboardInterrupt and SystemExit must
        # not be reported as an invalid template.
        raise ValueError("Invalid template")