triton/python/autotune/pysrc/genetic.py

import random, time, sys, copy
import misc_tools

import numpy as np
import pyatidlas as atd
from deap import algorithms
from deap import base
from deap import creator
from deap import tools as deap_tools

from collections import OrderedDict as odict


def closest_divisor(N, x):
    """Return the divisor of ``N`` that lies nearest to ``x``.

    ``x`` is rounded and clamped to the range ``[1, N]``.  Starting from that
    value, one search walks downwards and another walks upwards until each
    reaches a divisor of ``N``.  The lower candidate is returned only when it
    is strictly closer to the original (unrounded) ``x``; otherwise, ties
    included, the upper candidate is returned.
    """
    start = max(1, min(round(x), N))
    below = above = start

    # Nearest divisor at or below the starting point.
    while N % below > 0 and below > 0:
        below -= 1

    # Nearest divisor at or above the starting point (never beyond N).
    while N % above > 0 and above < N:
        above += 1

    if x - below < above - x:
        return below
    return above

def b_gray_to_bin(A='00000000', endian='big'):
    """Convert a Gray-coded bit string into the matching binary bit string.

    A      -- Gray code given as a sequence of '0'/'1' characters
    endian -- 'big' or 'little'; for 'little' the input is reversed before
              the conversion and the result is reversed back, so the output
              uses the same bit order as the input

    Returns a string of the same length as ``A``.  An empty ``A`` yields an
    empty string (it used to fail with an IndexError).
    Raises AssertionError when ``endian`` is not one of the two accepted
    strings.
    """
    assert type(endian) is str
    assert endian == 'little' or endian == 'big'
    if not A:
        # Nothing to decode; avoid indexing into an empty sequence.
        return ''
    if endian == 'little':
        A = A[::-1]  # Make sure endianness is big before conversion
    # The first bit is copied as-is; every further binary bit is the XOR of
    # the previous *binary* bit with the current Gray bit.
    bits = [A[0]]
    for gray_bit in A[1:]:
        bits.append(str(int(bits[-1] != gray_bit)))
    b = ''.join(bits)
    if endian == 'little':
        b = b[::-1]  # Convert back to little endian if necessary
    return b

class GeneticOperators(object):

    def __init__(self, symbolic, Template, out):
        """Prepare the genome layout and the DEAP toolbox for one template.

        symbolic -- expression to tune; the device is taken from the first
                    queue of its context
        Template -- template class whose parameters are searched; it has to be
                    one of the keys of the layout table below (KeyError
                    otherwise)
        out      -- object stored as ``self.out``; ``evaluate`` calls its
                    ``write`` method
        """
        self.device = symbolic.context.queues[0].device
        self.symbolic = symbolic
        self.Template = Template
        self.cache = {}
        self.out = out

        # Genome layout per template.  An integer entry is the number of bits
        # that encode one parameter; the fetching_policy_type marker stands
        # for a single gene holding an index into the fetching policies.
        policy = atd.fetching_policy_type
        layouts = {
            atd.vaxpy: [3, 4, 4, policy],
            atd.reduction: [3, 4, 4, policy],
            atd.maxpy: [3, 3, 3, 3, 3, policy],
            atd.mreduction_rows: [3, 3, 3, 4, policy],
            atd.mreduction_cols: [3, 3, 3, 4, policy],
        }
        for gemm in (atd.mproduct_nn, atd.mproduct_nt, atd.mproduct_tn, atd.mproduct_tt):
            layouts[gemm] = [3, 3, 3, 3, 3, 3, 3, policy, policy, 3]
        self.genome_info = layouts[Template]

        # Per-gene mutation probability: one over the total number of genes
        # (a policy marker counts as one gene, an integer entry as that many).
        gene_count = 0
        for entry in self.genome_info:
            gene_count += 1 if entry == atd.fetching_policy_type else entry
        self.indpb = 1.0/gene_count

        creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
        creator.create("Individual", list, fitness=creator.FitnessMin)

        self.toolbox = base.Toolbox()
        operators = (
            ("population", self.init),
            ("evaluate", self.evaluate),
            ("mate", deap_tools.cxTwoPoint),
            ("mutate", self.mutate),
            ("select", deap_tools.selNSGA2),
        )
        for alias, operator in operators:
            self.toolbox.register(alias, operator)

    def decode(self, genome):
        """Turn a genome into the positional parameter list of the template.

        The genome is consumed following ``self.genome_info``: a fetching
        policy marker takes one gene, used as an index into the list of
        fetching policies; an integer entry ``n`` takes ``n`` bit characters,
        which are read as a Gray code and mapped to ``2**value``.

        For the four mproduct templates one extra value is appended: when a
        decoded parameter equals FETCH_FROM_LOCAL it is
        ``result[1]*result[3]/result[9]``; otherwise parameter 9 is reset to 0
        and 0 is appended.
        """
        policy_type = atd.fetching_policy_type
        policies = [policy_type.FETCH_FROM_LOCAL,
                    policy_type.FETCH_FROM_GLOBAL_STRIDED,
                    policy_type.FETCH_FROM_GLOBAL_CONTIGUOUS]

        def bits_to_value(bits):
            # Gray-coded bit characters -> integer exponent -> power of two.
            return 2**int(b_gray_to_bin(''.join(bits)), 2)

        parameters = []
        position = 0
        for entry in self.genome_info:
            if entry == atd.fetching_policy_type:
                parameters.append(policies[genome[position]])
                width = 1
            else:
                parameters.append(bits_to_value(genome[position:position+entry]))
                width = entry
            position = position + width

        # GEMM peculiarities
        gemm_templates = [atd.mproduct_nn, atd.mproduct_nt, atd.mproduct_tn, atd.mproduct_tt]
        if self.Template not in gemm_templates:
            return parameters
        if policy_type.FETCH_FROM_LOCAL in parameters:
            lf1 = parameters[1]*parameters[3]/parameters[9]
        else:
            parameters[9] = 0
            lf1 = 0
        parameters.append(lf1)
        return parameters

    def init(self, N):
        """Create a random initial population of accepted individuals.

        The population is built in equally sized groups, one per allowed
        fetching-policy index ([0, 1, 2] for the mproduct templates, [1, 2]
        for every other template).  Inside a group every policy gene is set to
        the group's index and all remaining genes are random '0'/'1'
        characters.  A candidate is kept only when ``misc_tools.skip`` does
        not reject its template.

        N -- requested population size; each group is filled until its length
             is no longer below ``N/len(groups)``

        Returns a plain list of ``creator.Individual`` objects.
        """
        gemm_templates = [atd.mproduct_nn, atd.mproduct_nt, atd.mproduct_tn, atd.mproduct_tt]
        if self.Template in gemm_templates:
            policy_choices = [0, 1, 2]
        else:
            policy_choices = [1, 2]

        population = []
        for policy_idx in policy_choices:
            group = []
            # Each pass makes one attempt; rejected candidates are simply
            # discarded and the loop tries again.
            while len(group) < N/len(policy_choices):
                genes = []
                for entry in self.genome_info:
                    if entry == atd.fetching_policy_type:
                        genes.append(policy_idx)
                    else:
                        genes.extend(str(random.randint(0, 1)) for _ in range(entry))
                parameters = self.decode(genes)
                template = self.Template(*parameters)
                # NOTE(review): the occupancy record built here is not
                # consulted by the acceptance test below.
                expressions = atd.array_expression_container(self.symbolic)
                registers = template.registers_usage(expressions)/4
                lmem = template.lmem_usage(expressions)
                work_group_size = parameters[1]*parameters[3]
                occupancy_record = misc_tools.OccupancyRecord(self.device, work_group_size, lmem, registers)
                if not misc_tools.skip(template, self.symbolic):
                    group.append(creator.Individual(genes))
            population.extend(group)
        return population

    def mutate(self, individual):
        """Return a mutated deep copy of ``individual`` as a 1-tuple.

        Every gene is mutated independently with probability ``self.indpb``:
        an integer gene is redrawn from {0, 1, 2} until it differs from the
        original value, any other gene is treated as a bit character and
        flipped.  The whole attempt is repeated from a fresh copy until
        ``misc_tools.skip`` accepts the decoded template.  ``individual``
        itself is never modified.
        """
        while True:
            candidate = copy.deepcopy(individual)
            for position, original_gene in enumerate(individual):
                is_integer_gene = isinstance(original_gene, int)
                # Exactly one random draw per gene, whatever its kind.
                if random.random() < self.indpb:
                    if is_integer_gene:
                        while candidate[position] == original_gene:
                            candidate[position] = random.randint(0, 2)
                    elif candidate[position] == '0':
                        candidate[position] = '1'
                    else:
                        candidate[position] = '0'
            template = self.Template(*self.decode(candidate))
            if not misc_tools.skip(template, self.symbolic):
                return candidate,

    def evaluate(self, individual):
        """Return the fitness of ``individual`` as a 1-tuple.

        Results are memoised in ``self.cache`` under ``tuple(individual)``, so
        each distinct genome is benchmarked at most once.  On a cache miss the
        genome is decoded, the template is instantiated and passed to
        ``misc_tools.benchmark``; the measured value and the integer-converted
        parameters are then written to ``self.out`` as one comma-separated
        line.

        A ValueError raised by the benchmark call is recorded as a fixed
        fitness of 10.  Only the benchmark call is guarded: a ValueError
        coming from the log write or the formatting is no longer mistaken for
        a failed benchmark and propagates to the caller.
        """
        key = tuple(individual)
        if key not in self.cache:
            parameters = self.decode(individual)
            template = self.Template(*parameters)
            try:
                tt = misc_tools.benchmark(template, self.symbolic)
            except ValueError:
                # Benchmark failed for this configuration: fixed penalty value.
                self.cache[key] = 10
            else:
                fields = [str(tt)] + [str(int(p)) for p in parameters]
                self.out.write(','.join(fields) + '\n')
                self.cache[key] = tt
        return self.cache[key],

    def optimize(self, maxtime, maxgen, compute_perf, perf_metric):
        """Run the genetic search and return the best decoded parameters.

        maxtime      -- time budget as a string in the form '<minutes>m<seconds>s'
                        (parsed with the strptime format '%Mm%Ss')
        maxgen       -- maximum number of generations
        compute_perf -- callable applied to the best fitness value to obtain
                        the figure shown in the progress line
        perf_metric  -- label printed next to that figure

        The stopping conditions are only checked between generations, so a
        generation that has started always runs to completion.

        Returns ``self.decode`` of the single individual kept in the hall of
        fame.
        """
        hof = deap_tools.HallOfFame(1)
        # Begin the generational process
        gen = 0
        # Convert the 'MmSs' budget into a number of seconds.
        maxtime = time.strptime(maxtime, '%Mm%Ss')
        maxtime = maxtime.tm_min*60 + maxtime.tm_sec
        start_time = time.time()

        # Population size, crossover probability and mutation probability;
        # the remaining probability mass (1 - cxpb - mutpb) is plain
        # reproduction.
        mu = 30
        cxpb = 0.2
        mutpb = 0.7

        # Build and evaluate the initial population.
        population = self.init(mu)
        invalid_ind = [ind for ind in population if not ind.fitness.valid]
        fitnesses = self.toolbox.map(self.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
        hof.update(population)

        while time.time() - start_time < maxtime and gen < maxgen:
            # Vary the population: produce mu offspring, each by exactly one
            # of crossover, mutation or reproduction.
            offspring = []
            for _ in xrange(mu):
                op_choice = random.random()
                if op_choice < cxpb:            # Apply crossover
                    # Retry with a fresh pair of parents until the first
                    # child decodes to a template that is not skipped.
                    while True:
                        ind1, ind2 = map(self.toolbox.clone, random.sample(population, 2))
                        ind1, ind2 = self.toolbox.mate(ind1, ind2)
                        # Only the first child is kept, so only its fitness
                        # needs to be invalidated.
                        del ind1.fitness.values
                        parameters = self.decode(ind1)
                        template = self.Template(*parameters)
                        if not misc_tools.skip(template, self.symbolic):
                            break
                    offspring.append(ind1)
                elif op_choice < cxpb + mutpb:  # Apply mutation
                    ind = self.toolbox.clone(random.choice(population))
                    # mutate returns a 1-tuple, hence the unpacking.
                    ind, = self.toolbox.mutate(ind)
                    del ind.fitness.values
                    offspring.append(ind)
                else:                           # Apply reproduction
                    # The chosen member itself is appended (not a clone); its
                    # fitness stays valid, so it is not re-evaluated below.
                    offspring.append(random.choice(population))
            #for x in offspring:
                    #print self.decode(x)
            # Evaluate the individuals with an invalid fitness
            invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
            fitnesses = self.toolbox.map(self.evaluate, invalid_ind)
            for ind, fit in zip(invalid_ind, fitnesses):
                ind.fitness.values = fit
            # Update the hall of fame with the generated individuals
            hof.update(offspring)
            # Select the next generation population from parents and
            # offspring together, replacing the list contents in place.
            population[:] = self.toolbox.select(population + offspring, mu)
            # Update the generation counter and the progress line; the
            # trailing '\r' makes each report overwrite the previous one.
            gen = gen + 1
            best_profile = '(%s)'%','.join(map(str,self.decode(hof[0])))
            best_performance = compute_perf(hof[0].fitness.values[0])
            sys.stdout.write('Generation %d | Time %d | Best %d %s [ for %s ]\r'%(gen, time.time() - start_time, best_performance, perf_metric, best_profile))
            sys.stdout.flush()
        # Finish the progress line before returning.
        sys.stdout.write('\n')
        return self.decode(hof[0])