Auto-tuner : Initial push

This commit is contained in:
Philippe Tillet
2014-09-02 22:03:20 -04:00
parent 8489ab2513
commit 544583e6ca
5 changed files with 535 additions and 0 deletions

168
autotune/autotune.py Normal file
View File

@@ -0,0 +1,168 @@
from __future__ import division
import argparse
import itertools
import os
from external.configobj import ConfigObj
import pyopencl as cl
import pyviennacl as vcl
from pyviennacl import backend
from pyviennacl import opencl
from pyviennacl import atidlas
import utils
import vclio
import optimize
import sys
# Maps the precision names read from the configuration file to the PyViennaCL
# scalar types used to allocate the benchmark operands.
DATATYPES = { 'single' : vcl.float32,
'double' : vcl.float64
}
# Maps each tunable operation name to the template class that is instantiated
# as TYPES[operation](parameters, *other_params) during tuning.
TYPES = { 'vector-axpy': vcl.atidlas.VectorAxpyTemplate,
'matrix-axpy': vcl.atidlas.MatrixAxpyTemplate,
'reduction': vcl.atidlas.ReductionTemplate,
'row-wise-reduction': vcl.atidlas.RowWiseReductionTemplate,
'matrix-product': vcl.atidlas.MatrixProductTemplate
}
# Names of the tunable parameters of each operation. The order matters: entry i
# names the i-th list returned by parameter_space() for the same operation, and
# the genetic operators zip these names with an individual's values.
PNAMES = {
'vector-axpy': ['simd-width', 'local-size-0', 'num-groups-0', 'fetch'],
'matrix-axpy': ['simd-width', 'local-size-0', 'local-size-1', 'num-groups-0', 'num-groups-1', 'fetch'],
'reduction': ['simd-width', 'local-size-0', 'num-groups-0', 'fetch'],
'row-wise-reduction': ['simd-width', 'local-size-0', 'local-size-1', 'num-groups-0', 'fetch'],
'matrix-product': ['simd-width', 'local-size-0', 'kL', 'local-size-1', 'mS', 'kS', 'nS', 'A-fetch-policy', 'B-fetch-policy', 'local-fetch-size-0', 'local-fetch-size-1']
}
# (performance function, unit label) per operation. do_tuning calls the
# function with x = [itemsize, sizes, t]: x[0] is the scalar size in bytes,
# x[1] the list of problem sizes from the configuration, and x[2] the fitness
# value produced by the optimizer (the averaged execution time in seconds).
# NOTE(review): the 'reduction' entry reads x[1][1] like the matrix cases, and
# 'reduction' is not in the list of operations do_tuning iterates over --
# confirm the intended size layout before enabling it.
PERFINDEX = { 'vector-axpy' : (lambda x: 3*x[0]*x[1][0]/x[2]*1e-9, 'GB/s') ,
'matrix-axpy' : (lambda x: 3*x[0]*x[1][0]*x[1][1]/x[2]*1e-9, 'GB/s'),
'reduction' : (lambda x: 2*x[0]*x[1][0]*x[1][1]/x[2]*1e-9, 'GB/s'),
'row-wise-reduction' : (lambda x: x[0]*x[1][0]*x[1][1]/x[2]*1e-9, 'GB/s'),
'matrix-product': (lambda x: 2*x[1][0]*x[1][1]*x[1][2]/x[2]*1e-9, 'GFLOPs/s')
}
def parameter_space(operation):
    """Return the candidate values of every tunable parameter of `operation`.

    The result is a list of lists: entry i holds the values the i-th
    parameter of the operation may take. None is returned when the
    operation name is not one of the five known operations.
    """
    # NOTE(review): the genetic operators module reads the same enum through
    # vcl.atidlas.FetchingPolicy -- confirm both paths name the same type.
    policy = vcl.device_specific.FetchingPolicy
    simd_widths = [1, 2, 4, 8]
    sizes_1d = [2**exponent for exponent in range(12)]
    sizes_2d = [2**exponent for exponent in range(10)]
    unroll_factors = [2**exponent for exponent in range(6)]
    fetch_policies = [policy.FETCH_FROM_LOCAL,
                      policy.FETCH_FROM_GLOBAL_STRIDED,
                      policy.FETCH_FROM_GLOBAL_CONTIGUOUS]

    candidates = (
        ('vector-axpy', [simd_widths, sizes_1d, sizes_1d, fetch_policies]),
        ('reduction', [simd_widths, sizes_1d, sizes_1d, fetch_policies]),
        ('matrix-axpy', [simd_widths, sizes_2d, sizes_2d, sizes_2d, sizes_2d, fetch_policies]),
        ('row-wise-reduction', [simd_widths, sizes_2d, sizes_2d, sizes_1d, fetch_policies]),
        ('matrix-product', [simd_widths, sizes_2d, sizes_2d, sizes_2d,
                            unroll_factors, unroll_factors, unroll_factors,
                            fetch_policies, fetch_policies, sizes_2d, sizes_2d]),
    )
    for name, space in candidates:
        if operation == name:
            return space
    return None
def do_tuning(config_fname, spec_fname, viennacl_root):
    """Tune every operation that has a section in the configuration file.

    For each configured operation, each requested precision and each
    requested device, a benchmark expression is built and handed to
    optimize.genetic(). When the optimizer returns a result and
    `viennacl_root` is non-empty, the result is written into the ViennaCL
    headers through vclio.generate_viennacl_headers().

    config_fname  -- path of the configuration file
    spec_fname    -- path of the ConfigObj configspec file
    viennacl_root -- ViennaCL root directory, or '' to skip header generation
    """
    config = ConfigObj(config_fname, configspec=spec_fname)

    def as_list(convert, value):
        # An option may hold a single item or a list of items; normalise it
        # to a list of converted items either way.
        return list(map(convert, value if isinstance(value, list) else [value]))

    tmp_folder = config['tmp-folder'] if 'tmp-folder' in config else ""
    for operation in ['vector-axpy', 'matrix-axpy', 'row-wise-reduction', 'matrix-product']:
        if operation not in config:
            continue
        p = config[operation]

        # Each entry is either a preset name or an index into
        # utils.all_devices. Normalising first lets a list of entries work
        # (a list cannot be used as a dictionary key) and keeps a
        # multi-digit index from being split into characters.
        devices = []
        for key in as_list(str, p['devices']):
            if key in utils.DEVICES_PRESETS:
                devices.extend(utils.DEVICES_PRESETS[key])
            else:
                devices.append(utils.all_devices[int(key)])

        # Normalised like 'layout' below so that a single precision name is
        # not iterated character by character.
        precisions = as_list(str, p['precision'])
        if 'all' in precisions:
            precisions = ['single', 'double']
        datatypes = [DATATYPES[k] for k in precisions]
        s = as_list(int, p['size'])

        for datatype, device in itertools.product(datatypes, devices):
            ctx = cl.Context([device])
            ctx = vcl.backend.Context(ctx)
            device = ctx.current_device
            if datatype is vcl.float64 and not device.double_fp_config:
                sys.stderr.write('Warning : The device ' + device.name + ' does not support double precision! Skipping ...\n')
                continue

            def execute(node, other_params):
                # Tunes one expression; called immediately below, so the loop
                # variables it closes over are the current ones.
                print('-----')
                print(' '.join(map(str, ("Now tuning:", datatype.__name__, '-', operation, '-'.join(other_params), '[' + device.name, '(' + device.platform.name + ')]'))))
                tmp_file = os.path.join(tmp_folder, utils.sanitize_string(device.name) + "-" + datatype.__name__ + "-" + operation + '-'.join(other_params) + ".dat")
                if tmp_folder:
                    print('Saving history to ' + tmp_file)
                    fname = tmp_file
                else:
                    fname = os.devnull
                template_type = TYPES[operation]
                perf_function, perf_unit = PERFINDEX[operation]
                with open(fname, "w+") as archive:
                    with vcl.Statement(node) as statement:
                        result = optimize.genetic(statement, ctx, template_type,
                                                  lambda params: template_type(params, *other_params),
                                                  PNAMES[operation], parameter_space(operation),
                                                  lambda t: perf_function([datatype().itemsize, s, t]),
                                                  perf_unit, archive)
                if result and viennacl_root:
                    vclio.generate_viennacl_headers(viennacl_root, device, datatype, operation, other_params, result[1])

            if operation == 'vector-axpy':
                x = vcl.Vector(s[0], context=ctx, dtype=datatype)
                y = vcl.Vector(s[0], context=ctx, dtype=datatype)
                execute(vcl.ElementProd(vcl.exp(x + y), vcl.cos(x + y)), ())
            elif operation == 'matrix-axpy':
                A = vcl.Matrix(s, context=ctx, dtype=datatype)
                B = vcl.Matrix(s, context=ctx, dtype=datatype)
                execute(A + B, ())
            elif operation == 'row-wise-reduction':
                layouts = as_list(str, p['layout'])
                if 'all' in layouts:
                    layouts = ['N', 'T']
                for A_trans in layouts:
                    A = vcl.Matrix(s if A_trans == 'N' else s[::-1], context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
                    x = vcl.Vector(s[1] if A_trans == 'N' else s[0], context=ctx, dtype=datatype)
                    LHS = A if A_trans == 'N' else A.T
                    execute(LHS*x, (A_trans,))
            elif operation == 'matrix-product':
                layouts = as_list(str, p['layout'])
                if 'all' in layouts:
                    layouts = ['NN', 'NT', 'TN', 'TT']
                for layout in layouts:
                    A_trans = layout[0]
                    B_trans = layout[1]
                    A = vcl.Matrix((s[0], s[1]) if A_trans == 'N' else (s[1], s[0]), context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
                    B = vcl.Matrix((s[1], s[2]) if B_trans == 'N' else (s[2], s[1]), context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
                    LHS = A if A_trans == 'N' else A.T
                    RHS = B if B_trans == 'N' else B.T
                    alpha = vcl.HostScalar(1.0, context=ctx, dtype=datatype)
                    beta = vcl.HostScalar(1.0, context=ctx, dtype=datatype)
                    C = vcl.Matrix((s[0], s[2]), context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
                    execute(vcl.Assign(C, LHS*RHS*alpha + C*beta), (A_trans, B_trans))
if __name__ == "__main__":
    # Command line entry point: 'list-devices' prints the available OpenCL
    # devices, 'tune' runs the auto-tuner on a configuration file.
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest='action')
    subparsers.add_parser('list-devices', help='list the devices available')
    tune_parser = subparsers.add_parser('tune', help='tune using a specific configuration file')
    tune_parser.add_argument("--config", default="config.ini", required=False, type=str)
    tune_parser.add_argument("--viennacl-root", default='', required=False, type=str)
    args = parser.parse_args()
    if args.action == 'list-devices':
        print("----------------")
        print("Devices available:")
        print("----------------")
        # utils.all_devices is the list do_tuning indexes into, so the numbers
        # printed here are exactly the ones accepted by the 'devices' option.
        for i, d in enumerate(utils.all_devices):
            type_prefix = utils.DEVICE_TYPE_PREFIX.get(d.type, 'unknown')
            # A single pre-joined string prints identically whether print is
            # a statement or a function.
            print(' '.join(map(str, ('Device', i, ':', type_prefix.upper() + ':', d.name, 'on', d.platform.name))))
        print("----------------")
    else:
        print("------")
        print("Auto-tuning")
        print("------")
        do_tuning(args.config, 'config_spec.ini', args.viennacl_root)

View File

@@ -0,0 +1,126 @@
import random
import time
import pyviennacl as vcl
from collections import OrderedDict as odict
def closest_divisor(N, x):
    """Return the divisor of N that is closest to x.

    The search starts from x rounded and clamped to [1, N], then walks
    downwards and upwards to the nearest divisor on each side. When both
    are equally far from x, the larger one is returned. The result is
    always an int.
    """
    start = int(max(1, min(round(x), N)))
    # The bound is tested before the modulo so the divisor can never reach 0.
    low = start
    while low > 1 and N % low > 0:
        low -= 1
    high = start
    while high < N and N % high > 0:
        high += 1
    return low if x - low < high - x else high
class GeneticOperators(object):
    """Initialisation, repair, mutation and evaluation operators for tuning.

    An individual is a list holding one value per tunable parameter, in the
    order given by `parameter_names`.
    """

    def __init__(self, device, statement, parameters, parameter_names, TemplateType, build_template):
        """Store the tuning context.

        device          -- device whose max_work_group_size bounds the repair
        statement       -- statement that templates are checked and run against
        parameters      -- one list of candidate values per parameter
        parameter_names -- parameter names, in the same order as `parameters`
        TemplateType    -- template class; its `Parameters` attribute builds
                           a parameter object from an individual
        build_template  -- callable turning a parameter object into a template
        """
        self.device = device
        self.statement = statement
        self.parameters = parameters
        self.parameter_names = parameter_names
        self.TemplateType = TemplateType
        self.ParameterType = TemplateType.Parameters
        self.build_template = build_template
        # Maps an individual (as a tuple) to its measured fitness.
        self.cache = {}

    def init(self):
        """Draw random individuals until one passes the template check."""
        result = [random.choice(L) for L in self.parameters]
        while self.build_template(self.ParameterType(*result)).check(self.statement) != 0:
            result = [random.choice(L) for L in self.parameters]
        return result

    @staticmethod
    def min_to_hyperbol(a, tup):
        """Return integers (x, y) with x*y == a lying close to `tup`.

        Newton's method is applied to the derivative of
        (x - tup[0])**2 + (a/x - tup[1])**2 for at most 100 steps; the
        resulting x is then snapped to the closest divisor of a.
        """
        # Start from a float: with an integer start every division below
        # would be a floor division on interpreters where '/' truncates.
        x = 1.0
        for _ in range(100):
            dx = 2*(-a**2/x**3 + a*tup[1]/x**2 - tup[0] + x)
            ddx = 6*a**2/x**4 - 4*a*tup[1]/x**3 + 2
            if abs(dx) < 1e-7 or abs(ddx) < 1e-7:
                break
            x -= dx/ddx
            if x < 1 or x > a:
                # Left the admissible range: clamp and stop iterating.
                x = max(1, min(x, a))
                break
        new_x = int(closest_divisor(a, x))
        new_y = int(a // new_x)
        return (new_x, new_y)

    def repair(self, func):
        """Decorate `func` so that every individual it returns is repaired.

        `func` must return an iterable of individuals; each one is modified
        in place so that its parameters satisfy the constraints encoded in
        repair_impl below.
        """
        def repair_impl(child):
            D = odict(zip(self.parameter_names, child))
            dummy_template = self.build_template(self.ParameterType(*D.values()))
            FetchingPolicy = vcl.atidlas.FetchingPolicy
            max_group_size = self.device.max_work_group_size

            # Keep the work-group size within the device limit.
            if 'local-size-1' not in D:
                D['local-size-0'] = min(D['local-size-0'], max_group_size)
            elif D['local-size-0']*D['local-size-1'] > max_group_size:
                res = GeneticOperators.min_to_hyperbol(max_group_size, (D['local-size-0'], D['local-size-1']))
                D['local-size-0'] = res[0]
                D['local-size-1'] = res[1]

            if self.ParameterType is vcl.atidlas.MatrixProductTemplate.Parameters:
                if dummy_template.A_trans != 'N' and dummy_template.B_trans != 'T':
                    D['simd-width'] = 1
                # mS and nS are made multiples of simd-width, at least one.
                D['mS'] = max(D['mS'], D['simd-width'])
                D['mS'] = D['mS'] - D['mS'] % D['simd-width']
                D['nS'] = max(D['nS'], D['simd-width'])
                D['nS'] = D['nS'] - D['nS'] % D['simd-width']

                local = FetchingPolicy.FETCH_FROM_LOCAL
                if D['A-fetch-policy'] != local and D['B-fetch-policy'] != local:
                    D['local-fetch-size-0'] = D['local-fetch-size-1'] = 0
                else:
                    # The two local fetch sizes must multiply to the
                    # work-group size.
                    res = GeneticOperators.min_to_hyperbol(D['local-size-0']*D['local-size-1'], (D['local-fetch-size-0'], D['local-fetch-size-1']))
                    D['local-fetch-size-0'] = res[0]
                    D['local-fetch-size-1'] = res[1]

                fetch_1 = D['local-fetch-size-1']
                a_constrains_kL = D['A-fetch-policy'] == local and dummy_template.A_trans == 'N'
                b_constrains_kL = D['B-fetch-policy'] == local and dummy_template.B_trans == 'T'
                if (a_constrains_kL or b_constrains_kL) and D['kL'] % fetch_1 > 0:
                    # Round kL to the nearest non-zero multiple of fetch_1.
                    # The float() forces a true division and the int() keeps
                    # the parameter integral whatever round() returns.
                    D['kL'] = max(1, int(round(D['kL'] / float(fetch_1)))) * fetch_1
                D['kS'] = min(D['kL'], D['kS'])
            # A real list, so the caller can index it on any interpreter.
            return list(D.values())

        def wrapper(*args, **kargs):
            offspring = func(*args, **kargs)
            for child in offspring:
                for i, value in enumerate(repair_impl(child)):
                    if child[i] != value:
                        child[i] = value
            return offspring
        return wrapper

    def mutate(self, individual, indpb):
        """Redraw each gene with probability `indpb`; return a 1-tuple."""
        for i in range(len(individual)):
            if random.random() < indpb:
                individual[i] = random.choice(self.parameters[i])
        return individual,

    def evaluate(self, individual):
        """Return the fitness of `individual` as a 1-tuple.

        The fitness is the average execution time in seconds of the template
        built from the individual, or 100 when the template check rejects
        it. Results are cached per individual.
        """
        tupindividual = tuple(individual)
        print(tupindividual)
        if tupindividual not in self.cache:
            template = self.build_template(self.ParameterType(*individual))
            if template.check(self.statement) != 0:
                # Rejected configuration: large constant penalty.
                self.cache[tupindividual] = 100
            else:
                # First execution is not timed.
                template.execute(self.statement, True)
                self.statement.result.context.finish_all_queues()
                N = 0
                current_time = 0
                # Repeat until at least 10 ms have been accumulated.
                while current_time < 1e-2:
                    time_before = time.time()
                    template.execute(self.statement, False)
                    self.statement.result.context.finish_all_queues()
                    current_time += time.time() - time_before
                    N += 1
                self.cache[tupindividual] = current_time/N
        return self.cache[tupindividual],

92
autotune/optimize.py Normal file
View File

@@ -0,0 +1,92 @@
import array
import numpy as np
import random
import time
from deap import algorithms
from deap import base
from deap import creator
from deap import tools
from genetic_operators import GeneticOperators
def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, maxtime,
                   stats=None, halloffame=None, verbose=__debug__):
    """Run a (mu + lambda) evolutionary loop bounded by wall-clock time.

    population -- initial individuals; updated in place
    toolbox    -- DEAP toolbox providing evaluate, select and map, plus the
                  operators used by algorithms.varOr
    mu         -- number of individuals selected for the next generation
    lambda_    -- number of offspring produced per generation
    cxpb       -- crossover probability passed to varOr
    mutpb      -- mutation probability passed to varOr
    maxtime    -- time budget written as '<minutes>m<seconds>s', e.g. '5m0s'
    stats      -- optional statistics object compiled every generation
    halloffame -- optional hall of fame updated every generation
    verbose    -- when true, print each logbook record as it is produced

    Returns the final population and the logbook.
    Raises ValueError when `maxtime` does not match the expected format.
    """
    def evaluate_invalid(individuals):
        # Evaluate only the individuals without a valid fitness and return
        # how many evaluations were needed.
        pending = [ind for ind in individuals if not ind.fitness.valid]
        for ind, fit in zip(pending, toolbox.map(toolbox.evaluate, pending)):
            ind.fitness.values = fit
        return len(pending)

    # Parsed first so that a malformed budget fails before any evaluation.
    parsed = time.strptime(maxtime, '%Mm%Ss')
    budget = parsed.tm_min*60 + parsed.tm_sec

    logbook = tools.Logbook()
    logbook.header = ['gen', 'nevals'] + (stats.fields if stats else [])

    nevals = evaluate_invalid(population)
    if halloffame is not None:
        halloffame.update(population)
    record = stats.compile(population) if stats is not None else {}
    logbook.record(gen=0, nevals=nevals, **record)
    if verbose:
        print(logbook.stream)

    # Begin the generational process
    gen = 0
    start_time = time.time()
    while time.time() - start_time < budget:
        # Vary the population
        offspring = algorithms.varOr(population, toolbox, lambda_, cxpb, mutpb)
        nevals = evaluate_invalid(offspring)
        # Update the hall of fame with the generated individuals
        if halloffame is not None:
            halloffame.update(offspring)
        # Select the next generation population
        population[:] = toolbox.select(population + offspring, mu)
        # Update the statistics with the new population
        gen = gen + 1
        record = stats.compile(population) if stats is not None else {}
        logbook.record(gen=gen, nevals=nevals, **record)
        if verbose:
            print(logbook.stream)
    return population, logbook
def genetic(statement, context, TemplateType, build_template, parameter_names, all_parameters, compute_perf, perf_metric, out):
    """Search the parameter space of a template with a genetic algorithm.

    statement       -- statement the templates are checked and timed against
    context         -- context whose first device bounds the work-group size
    TemplateType    -- template class being tuned
    build_template  -- callable turning a parameter object into a template
    parameter_names -- names of the tunable parameters
    all_parameters  -- one list of candidate values per parameter
    compute_perf    -- converts a fitness value into a performance figure
    perf_metric     -- unit label shown in the statistics header
    out             -- writable file object receiving the logbook, or None

    Returns (best performance, best parameters as a tuple), or None when
    the hall of fame is empty. The caller reads the parameters as item 1
    of the result.
    """
    gen = GeneticOperators(context.devices[0], statement, all_parameters, parameter_names, TemplateType, build_template)

    # creator.create() registers the class globally; guard it so repeated
    # calls (one per tuned expression) do not recreate the classes.
    if not hasattr(creator, "FitnessMin"):
        creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
    if not hasattr(creator, "Individual"):
        creator.create("Individual", list, fitness=creator.FitnessMin)

    toolbox = base.Toolbox()
    toolbox.register("individual", tools.initIterate, creator.Individual, gen.init)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.decorate("population", gen.repair)
    toolbox.register("evaluate", gen.evaluate)
    toolbox.register("mate", tools.cxUniform, indpb=0.3)
    toolbox.decorate("mate", gen.repair)
    toolbox.register("mutate", gen.mutate, indpb=0.2)
    toolbox.decorate("mutate", gen.repair)
    toolbox.register("select", tools.selNSGA2)

    pop = toolbox.population(n=10)
    hof = tools.HallOfFame(1)

    # Both statistics report on the best individual found so far and ignore
    # the per-generation fitness values they are handed.
    best_performance = lambda _: compute_perf(hof[0].fitness.values[0])
    best_profile = lambda _: '(%s)' % ','.join(map(str, hof[0]))
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("max (" + perf_metric + ")", best_performance)
    stats.register("profile ", best_profile)

    # NOTE(review): the original also defined an unused local cxpb = 0.5;
    # the value actually passed (0.2) is kept -- confirm which was intended.
    pop, log = eaMuPlusLambda(pop, toolbox, 10, 20, cxpb=0.2, mutpb=0.2, maxtime='5m0s', stats=stats, halloffame=hof, verbose=True)

    if out is not None:
        out.write(str(log) + '\n')
    if len(hof) == 0:
        return None
    return compute_perf(hof[0].fitness.values[0]), tuple(hof[0])

33
autotune/utils.py Normal file
View File

@@ -0,0 +1,33 @@
import pyopencl as cl
import pyviennacl as vcl
# Every OpenCL device of every platform, in platform order. Device indices
# used elsewhere refer to positions in this list.
all_devices = [device for platform in cl.get_platforms() for device in platform.get_devices()]

# Short lowercase name of each supported device type.
DEVICE_TYPE_PREFIX = {
    cl.device_type.GPU: 'gpu',
    cl.device_type.CPU: 'cpu',
    cl.device_type.ACCELERATOR: 'accelerator',
}

# Name of the OpenCL constant corresponding to each supported device type.
DEVICE_TYPE_CL_NAME = {
    cl.device_type.GPU: 'CL_DEVICE_TYPE_GPU',
    cl.device_type.CPU: 'CL_DEVICE_TYPE_CPU',
    cl.device_type.ACCELERATOR: 'CL_DEVICE_TYPE_ACCELERATOR',
}

# Short lowercase name of each known vendor id.
VENDOR_PREFIX = {
    vcl.opencl.VendorId.beignet_id: 'beignet',
    vcl.opencl.VendorId.nvidia_id: 'nvidia',
    vcl.opencl.VendorId.amd_id: 'amd',
    vcl.opencl.VendorId.intel_id: 'intel',
}

# Named groups of devices: all of them, or all devices of one type.
DEVICES_PRESETS = {'all': all_devices}
for _preset_name, _preset_type in (('gpus', cl.device_type.GPU),
                                   ('cpus', cl.device_type.CPU),
                                   ('accelerators', cl.device_type.ACCELERATOR)):
    DEVICES_PRESETS[_preset_name] = [device for device in all_devices if device.type == _preset_type]
def sanitize_string(string, keep_chars=('_',)):
    """Return `string` reduced to an identifier-like lowercase form.

    Spaces become underscores, the text is lowercased, and every character
    that is neither alphanumeric nor listed in `keep_chars` is dropped.
    Trailing whitespace that survives (only possible when `keep_chars`
    contains whitespace) is stripped.

    string     -- text to sanitize
    keep_chars -- non-alphanumeric characters to preserve; any container
                  supporting `in` is accepted
    """
    # The default is an immutable tuple so it cannot be altered between calls.
    lowered = string.replace(' ', '_').lower()
    return "".join(c for c in lowered if c.isalnum() or c in keep_chars).rstrip()

116
autotune/vclio.py Normal file
View File

@@ -0,0 +1,116 @@
import sys
import os
import utils
def append_include(data, path):
    """Return `data` with an #include of `path` added if it is missing.

    The directive is inserted, preceded by an empty line, right after the
    line containing the first '#define' found in `data`. When the directive
    is already present, `data` is returned unchanged.

    Raises ValueError when a directive has to be inserted and `data` holds
    no '#define', or no newline after it.
    """
    directive = '#include "' + path + '"\n'
    if directive in data:
        return data
    define_position = data.index('#define')
    split_at = data.index('\n', define_position) + 1
    return ''.join((data[:split_at], '\n', directive, data[split_at:]))
def generate_viennacl_headers(viennacl_root, device, datatype, operation, additional_parameters, parameters):
    """Record a tuned parameter set in the ViennaCL built-in database headers.

    Two files are written: the device-specific header, which is created
    when missing and receives (or has replaced) the function registering
    `parameters`, and the per-operation header, which gains the include of
    the device header and a call to that function.

    viennacl_root         -- ViennaCL root directory
    device                -- device the parameters were tuned for
    datatype              -- vcl.float32 or vcl.float64
    operation             -- operation name, e.g. 'matrix-product'
    additional_parameters -- sequence of 'N'/'T' flags identifying the variant
    parameters            -- tuned parameters; str(parameters) is appended
                             verbatim after '<template>::parameters'

    Raises EnvironmentError when the built-in database directory is missing.
    """
    # This module does not import pyviennacl at file level, so the name is
    # brought into scope here.
    import pyviennacl as vcl

    builtin_database_dir = os.path.join(viennacl_root, "device_specific", "builtin_database")
    if not os.path.isdir(builtin_database_dir):
        raise EnvironmentError('ViennaCL root path is incorrect. Cannot access ' + builtin_database_dir + '!\n'
                               'Your version of ViennaCL may be too old and/or corrupted.')

    # Accept any sequence; tuple concatenation is used further down.
    additional_parameters = tuple(additional_parameters)
    function_name_dict = {vcl.float32: 'add_4B',
                          vcl.float64: 'add_8B'}
    additional_parameters_dict = {'N': "char_to_type<'N'>",
                                  'T': "char_to_type<'T'>"}

    #Create the device-specific headers
    cpp_device_name = utils.sanitize_string(device.name)
    function_name = function_name_dict[datatype]
    operation = operation.replace('-', '_')
    cpp_class_name = operation + '_template'
    header_name = cpp_device_name + ".hpp"
    declaration_arguments = ['database_type<' + cpp_class_name + '::parameters_type> & db']
    declaration_arguments += [additional_parameters_dict[x] for x in additional_parameters]
    function_declaration = 'inline void ' + function_name + '(' + ', '.join(declaration_arguments) + ')'

    device_type_prefix = utils.DEVICE_TYPE_PREFIX[device.type]
    vendor_prefix = utils.VENDOR_PREFIX[device.vendor_id]
    architecture_family = vcl.opencl.architecture_family(device.vendor_id, device.name)
    header_hierarchy = ["devices", device_type_prefix, vendor_prefix, architecture_family]
    header_directory = os.path.join(builtin_database_dir, *header_hierarchy)
    header_path = os.path.join(header_directory, header_name)
    if not os.path.exists(header_directory):
        os.makedirs(header_directory)

    if os.path.exists(header_path):
        with open(header_path, "r") as myfile:
            data = myfile.read()
    else:
        data = ''
    if not data:
        # Missing or empty header: start from an empty namespace skeleton.
        ifndef_suffix = ('_'.join(header_hierarchy) + '_hpp_').upper()
        data = ('#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_' + ifndef_suffix + '\n'
                '#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_' + ifndef_suffix + '\n'
                '\n'
                '#include "viennacl/device_specific/forwards.h"\n'
                '#include "viennacl/device_specific/builtin_database/common.hpp"\n'
                '\n'
                'namespace viennacl{\n'
                'namespace device_specific{\n'
                'namespace builtin_database{\n'
                'namespace devices{\n'
                'namespace ' + device_type_prefix + '{\n'
                'namespace ' + vendor_prefix + '{\n'
                'namespace ' + architecture_family + '{\n'
                'namespace ' + cpp_device_name + '{\n'
                '\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '#endif\n'
                '')
    data = append_include(data, 'viennacl/device_specific/templates/' + cpp_class_name + '.hpp')

    add_to_database_arguments = [vendor_prefix + '_id', utils.DEVICE_TYPE_CL_NAME[device.type], 'ocl::' + architecture_family,
                                 '"' + device.name + '"', cpp_class_name + '::parameters' + str(parameters)]
    core = ' db.' + function_name + '(' + ', '.join(add_to_database_arguments) + ');'
    already_declared = data.find(function_declaration)
    if already_declared == -1:
        # New function: place it at the top of the device namespace.
        substr = 'namespace ' + cpp_device_name + '{\n'
        insert_index = data.index(substr) + len(substr)
        data = data[:insert_index] + '\n' + function_declaration + '\n{\n' + core + '\n}\n' + data[insert_index:]
    else:
        # Existing function: replace its body, from the opening brace (and
        # the newline in front of it, if any) up to the first closing brace.
        i1 = data.find('{', already_declared)
        if data[i1-1] == '\n':
            i1 = i1 - 1
        i2 = data.find('}', already_declared) + 1
        data = data[:i1] + '\n{\n' + core + '\n}' + data[i2:]

    #Write the header file
    with open(header_path, "w+") as myfile:
        myfile.write(data)

    #Updates the global ViennaCL headers
    with open(os.path.join(builtin_database_dir, operation + '.hpp'), 'r+') as operation_header:
        data = operation_header.read()
        # #include paths use forward slashes whatever the host separator is.
        include_path = os.path.relpath(header_path, os.path.join(viennacl_root, os.pardir)).replace(os.sep, '/')
        data = append_include(data, include_path)
        scope_name = '_'.join(('init', operation) + additional_parameters)
        scope = data.index(scope_name)
        function_call = ' ' + '::'.join(header_hierarchy + [cpp_device_name, function_name]) + '(' + ', '.join(['result'] + [additional_parameters_dict[k] + '()' for k in additional_parameters]) + ')'
        if function_call not in data:
            # Insert at the start of the line holding 'return result', so the
            # call gets a line of its own rather than being glued to the end
            # of the previous one.
            insert_index = data.rindex('\n', 0, data.index('return result', scope)) + 1
            data = data[:insert_index] + function_call + ';\n' + data[insert_index:]
        operation_header.seek(0)
        operation_header.truncate()
        operation_header.write(data)