import random, time, sys, copy

import misc_tools
import numpy as np
import pyatidlas as atd
import pyviennacl as vcl

from deap import algorithms
from deap import base
from deap import creator
from deap import tools as deap_tools

from collections import OrderedDict as odict

def closest_divisor(N, x):
    # Return the divisor of N closest to x (ties resolve towards the larger divisor)
    x_low = x_high = int(max(1, min(round(x), N)))
    while N % x_low > 0 and x_low > 0:
        x_low = x_low - 1
    while N % x_high > 0 and x_high < N:
        x_high = x_high + 1
    return x_low if x - x_low < x_high - x else x_high
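
# Example (illustrative): closest_divisor(24, 7) steps down to 6 and up to 8;
# both divisors are one step away from 7, and the tie resolves to the larger one, 8.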

def b_gray_to_bin(A='00000000', endian='big'):
    # Convert a Gray-coded bit string to a plain binary bit string
    assert type(endian) is str
    assert endian == 'little' or endian == 'big'
    if endian == 'little': A = A[::-1] # Make sure endianness is big before conversion
    b = A[0]
    for i in range(1, len(A)): b += str( int(b[i-1] != A[i]) )
    if endian == 'little': b = b[::-1] # Convert back to little endian if necessary
    return b
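
# Example (illustrative): b_gray_to_bin('0110') == '0100', i.e. the Gray code
# 0110 decodes to binary 4; decode() below uses this to turn each Gray-coded
# bit-field into a power-of-two parameter value.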

class GeneticOperators(object):

    def __init__(self, device, statement, TemplateType, build_template, out):
        self.device = device
        self.statement = statement
        self.TemplateType = TemplateType
        self.ParameterType = TemplateType.Parameters
        self.build_template = build_template
        self.cache = {}
        self.out = out

        # Genome layout per template type: an integer entry means "that many
        # Gray-coded bits" (decoded to a power of two); a FetchingPolicy entry
        # means a single gene selecting the fetching policy.
        self.genome_info = {
            atd.VectorAxpyTemplate: [3,4,4,atd.FetchingPolicy],
            atd.ReductionTemplate: [3,4,4,atd.FetchingPolicy],
            atd.MatrixAxpyTemplate: [3,3,3,3,3,atd.FetchingPolicy],
            atd.RowWiseReductionTemplate: [3,3,3,4,atd.FetchingPolicy],
            atd.MatrixProductTemplate: [3,3,3,3,3,3,3,atd.FetchingPolicy,atd.FetchingPolicy,3]
        }[TemplateType]
        # Per-gene mutation probability: one mutated gene per individual on average
        self.indpb = 1.0/sum([1 if x==atd.FetchingPolicy else x for x in self.genome_info])

        creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
        creator.create("Individual", list, fitness=creator.FitnessMin)

        self.toolbox = base.Toolbox()
        self.toolbox.register("population", self.init)
        self.toolbox.register("evaluate", self.evaluate)
        self.toolbox.register("mate", deap_tools.cxTwoPoint)
        self.toolbox.register("mutate", self.mutate)
        self.toolbox.register("select", deap_tools.selNSGA2)

    def decode(self, genome):
        FetchingPolicy = atd.FetchingPolicy
        fetch = [FetchingPolicy.FETCH_FROM_LOCAL, FetchingPolicy.FETCH_FROM_GLOBAL_STRIDED, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS]
        # Each bit-field is Gray-decoded and mapped to a power of two
        decode_element = lambda x: 2**int(b_gray_to_bin(''.join(x)), 2)
        result = []
        offset = 0
        for x in self.genome_info:
            if x==atd.FetchingPolicy:
                result.append(fetch[genome[offset]])
                offset = offset + 1
            else:
                result.append(decode_element(genome[offset:offset+x]))
                offset = offset + x
        #GEMM peculiarities
        if self.TemplateType==atd.MatrixProductTemplate:
            if FetchingPolicy.FETCH_FROM_LOCAL in result:
                lf1 = result[1]*result[3]/result[9]
            else:
                result[9] = 0
                lf1 = 0
            result.append(lf1)
        return result
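
    # Illustrative decoding example (assuming a VectorAxpyTemplate genome, whose
    # layout is [3, 4, 4, FetchingPolicy]):
    #   genome = ['0','1','1',  '0','0','1','0',  '0','0','1','1',  2]
    #   '011'  -> Gray '011'  = binary '010'  = 2 -> 2**2 = 4
    #   '0010' -> Gray '0010' = binary '0011' = 3 -> 2**3 = 8
    #   '0011' -> Gray '0011' = binary '0010' = 2 -> 2**2 = 4
    #   2      -> fetch[2]    = FETCH_FROM_GLOBAL_CONTIGUOUS
    #   decode(genome) == [4, 8, 4, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS]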

    def init(self, N):
        result = []
        # GEMM may also fetch from local memory; other templates only use the two global policies
        allowed_idx = [0,1,2] if self.TemplateType==atd.MatrixProductTemplate else [1,2]
        for idx in allowed_idx:
            current = []
            while len(current) < N/len(allowed_idx):
                while True:
                    bincode = []
                    for i, x in enumerate(self.genome_info):
                        if x==atd.FetchingPolicy:
                            bincode = bincode + [idx]
                        else:
                            bincode = bincode + [str(random.randint(0,1)) for i in range(x)]
                    parameters = self.decode(bincode)
                    template = self.build_template(self.TemplateType.Parameters(*parameters))
                    registers_usage = template.registers_usage(vcl.pycore.StatementsTuple(self.statement))/4
                    lmem_usage = template.lmem_usage(vcl.pycore.StatementsTuple(self.statement))
                    local_size = template.parameters.local_size_0*template.parameters.local_size_1
                    occupancy_record = misc_tools.OccupancyRecord(self.device, local_size, lmem_usage, registers_usage)
                    if not misc_tools.skip(template, self.statement, self.device):
                        current.append(creator.Individual(bincode))
                        break
            result = result + current
        return result
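
    # For instance, init(30) on a non-GEMM template splits the population across
    # the two global fetching policies (allowed_idx = [1, 2]), drawing 15 random
    # valid individuals per policy; for GEMM it also draws individuals using
    # local-memory fetching (idx 0), 10 per policy.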

    def mutate(self, individual):
        while True:
            new_individual = copy.deepcopy(individual)
            for i in range(len(new_individual)):
                # Integer genes select a fetching policy; string genes are Gray-code bits
                if isinstance(individual[i], int) and random.random() < self.indpb:
                    while new_individual[i] == individual[i]:
                        new_individual[i] = random.randint(0, 2)
                elif not isinstance(individual[i], int) and random.random() < self.indpb:
                    new_individual[i] = '1' if new_individual[i]=='0' else '0'
            parameters = self.decode(new_individual)
            template = self.build_template(self.TemplateType.Parameters(*parameters))
            #print misc_tools.skip(template, self.statement, self.device), parameters
            if not misc_tools.skip(template, self.statement, self.device):
                break
        return new_individual,

    def evaluate(self, individual):
        if tuple(individual) not in self.cache:
            parameters = self.decode(individual)
            template = self.build_template(self.TemplateType.Parameters(*parameters))
            try:
                tt = misc_tools.benchmark(template, self.statement, self.device)
                self.out.write(','.join([str(tt)]+map(str,map(int,parameters)))+'\n')
                self.cache[tuple(individual)] = tt
            except:
                # Penalize profiles that fail to build or run
                self.cache[tuple(individual)] = 10
        return self.cache[tuple(individual)],

    def optimize(self, maxtime, maxgen, compute_perf, perf_metric):
        hof = deap_tools.HallOfFame(1)
        # Begin the generational process
        gen = 0
        maxtime = time.strptime(maxtime, '%Mm%Ss')
        maxtime = maxtime.tm_min*60 + maxtime.tm_sec
        start_time = time.time()

        mu = 30
        cxpb = 0.2
        mutpb = 0.7

        population = self.init(mu)
        invalid_ind = [ind for ind in population if not ind.fitness.valid]
        fitnesses = self.toolbox.map(self.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
        hof.update(population)

        while time.time() - start_time < maxtime and gen < maxgen:
            # Vary the population
            offspring = []
            for _ in xrange(mu):
                op_choice = random.random()
                if op_choice < cxpb: # Apply crossover
                    ind1, ind2 = map(self.toolbox.clone, random.sample(population, 2))
                    ind1, ind2 = self.toolbox.mate(ind1, ind2)
                    del ind1.fitness.values
                    offspring.append(ind1)
                elif op_choice < cxpb + mutpb: # Apply mutation
                    ind = self.toolbox.clone(random.choice(population))
                    ind, = self.toolbox.mutate(ind)
                    del ind.fitness.values
                    offspring.append(ind)
                else: # Apply reproduction
                    offspring.append(random.choice(population))
            #for x in offspring:
            #    print self.decode(x)
            # Evaluate the individuals with an invalid fitness
            invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
            fitnesses = self.toolbox.map(self.evaluate, invalid_ind)
            for ind, fit in zip(invalid_ind, fitnesses):
                ind.fitness.values = fit
            # Update the hall of fame with the generated individuals
            hof.update(offspring)
            # Select the next generation population
            population[:] = self.toolbox.select(population + offspring, mu)
            # Update progress report
            gen = gen + 1
            best_profile = '(%s)'%','.join(map(str,self.decode(hof[0])))
            best_performance = compute_perf(hof[0].fitness.values[0])
            sys.stdout.write('Generation %d | Time %d | Best %d %s [ for %s ]\r'%(gen, time.time() - start_time, best_performance, perf_metric, best_profile))
            sys.stdout.flush()
        sys.stdout.write('\n')
        return self.decode(hof[0])
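
# Example usage (a minimal, illustrative sketch -- the device, statement and
# build_template objects come from the surrounding pyatidlas/pyviennacl tuning
# scripts and are assumed here, not constructed in this file):
#
#   ops = GeneticOperators(device, statement, atd.VectorAxpyTemplate,
#                          build_template, out=sys.stdout)
#   best = ops.optimize(maxtime='2m30s', maxgen=1000,
#                       compute_perf=lambda t: 1e-9*flops/t,  # flops: assumed flop count
#                       perf_metric='GFLOP/s')
#   # 'best' is the decoded parameter list of the best profile found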