Now compiling ATIDLAS

This commit is contained in:
Philippe Tillet
2014-10-14 23:49:18 -04:00
parent f60adab3dc
commit f91d3b422a
12 changed files with 321 additions and 2501 deletions

172
python/autotune/autotune.py Normal file
View File

@@ -0,0 +1,172 @@
from __future__ import division
import argparse
import itertools
import os
from configobj import ConfigObj
from numpy import random
import pyopencl as cl
import pyviennacl as vcl
from pyviennacl import backend, opencl, atidlas
from dataset import generate_dataset
from model import train_model
import tools
import optimize
import sys
# Precision names accepted in the configuration file -> ViennaCL scalar types.
DATATYPES = {
    'single': vcl.float32,
    'double': vcl.float64,
}

# One entry per tunable operation:
#   'template'     : the atidlas template class to instantiate,
#   'perf-index'   : callable receiving the list [itemsize, sizes, time]
#                    (this is how do_tuning calls it) and returning the
#                    performance figure,
#   'perf-measure' : the unit label printed next to that figure.
TYPES = {
    'vector-axpy': {
        'template': vcl.atidlas.VectorAxpyTemplate,
        'perf-index': lambda x: 3*x[0]*x[1][0]/x[2]*1e-9,
        'perf-measure': 'GB/s',
    },
    'matrix-axpy': {
        'template': vcl.atidlas.MatrixAxpyTemplate,
        'perf-index': lambda x: 3*x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
        'perf-measure': 'GB/s',
    },
    'reduction': {
        'template': vcl.atidlas.ReductionTemplate,
        'perf-index': lambda x: 2*x[0]*x[1][0]/x[2]*1e-9,
        'perf-measure': 'GB/s',
    },
    'row-wise-reduction': {
        'template': vcl.atidlas.RowWiseReductionTemplate,
        'perf-index': lambda x: x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
        'perf-measure': 'GB/s',
    },
    'matrix-product': {
        'template': vcl.atidlas.MatrixProductTemplate,
        'perf-index': lambda x: 2*x[1][0]*x[1][1]*x[1][2]/x[2]*1e-9,
        'perf-measure': 'GFLOP/s',
    },
}
def do_tuning(config_fname, viennacl_root):
    """Tune every operation described in a configuration file.

    The file is read with ConfigObj. Each of the sections 'vector-axpy',
    'matrix-axpy', 'reduction', 'row-wise-reduction' and 'matrix-product'
    that is present is tuned for every (precision, device) combination it
    requests. A section provides:

    * 'devices'   -- one of the presets 'all', 'gpus', 'cpus',
                     'accelerators', or one or several indices into the
                     list of all OpenCL devices;
    * 'precision' -- 'single', 'double' and/or 'all';
    * 'layout'    -- (row-wise-reduction and matrix-product only) the
                     transposition layouts, or 'all';
    * 'size'      -- optional. When present, the operation is tuned for that
                     single size and the ViennaCL headers are updated if a
                     source root is known; otherwise a dataset is generated
                     and a model is trained on it.

    :param config_fname: path of the configuration file.
    :param viennacl_root: ViennaCL source root. When empty, the top-level
        'viennacl-src-root' entry of the configuration file is used instead.
    """
    config = ConfigObj(config_fname)

    def map_to_list(T, x):
        """Apply ``T`` to ``x`` (a scalar or a list) and return a list."""
        return list(map(T, x if isinstance(x, list) else [x]))

    # The explicit argument wins; the configuration entry is the fallback.
    if viennacl_root:
        viennacl_src_root = viennacl_root
    elif 'viennacl-src-root' in config:
        viennacl_src_root = config['viennacl-src-root']
    else:
        viennacl_src_root = None

    for operation in ['vector-axpy', 'matrix-axpy', 'reduction', 'row-wise-reduction', 'matrix-product']:
        if operation not in config:
            continue
        p = config[operation]

        # Resolve the requested devices
        confdevices = p['devices']
        all_devices = [d for platform in cl.get_platforms() for d in platform.get_devices()]
        DEVICES_PRESETS = {'all': all_devices,
                           'gpus': [d for d in all_devices if d.type == cl.device_type.GPU],
                           'cpus': [d for d in all_devices if d.type == cl.device_type.CPU],
                           'accelerators': [d for d in all_devices if d.type == cl.device_type.ACCELERATOR]}
        # A list value is unhashable, so it must not reach the dict lookup.
        if not isinstance(confdevices, list) and confdevices in DEVICES_PRESETS:
            devices = DEVICES_PRESETS[confdevices]
        else:
            # Parse whole entries: iterating a plain string would split a
            # multi-digit index such as "12" into devices 1 and 2.
            devices = [all_devices[i] for i in map_to_list(int, confdevices)]

        # Resolve the requested precisions
        precisions = map_to_list(str, p['precision'])
        if 'all' in precisions:
            precisions = ['single', 'double']
        datatypes = [DATATYPES[k] for k in precisions]

        # Iterate through the datatypes and the devices
        for datatype, device in itertools.product(datatypes, devices):
            ctx = cl.Context([device])
            ctx = vcl.backend.Context(ctx)
            device = ctx.current_device

            # Check data-type
            if datatype is vcl.float64 and not device.double_fp_config:
                sys.stderr.write('Warning : The device ' + device.name + ' does not support double precision! Skipping ...')
                continue

            # Helper for execution: benchmarks the given parameters when
            # some are provided, otherwise runs the genetic search.
            def execute(device, node, other_params, sizes, fname=os.devnull, parameters=None):
                with vcl.Statement(node) as statement:
                    if parameters:
                        TemplateType = TYPES[operation]['template']
                        return tools.benchmark(TemplateType(TemplateType.Parameters(*parameters), *other_params), statement, device)
                    print('-----')
                    print(' '.join(map(str, ("Now tuning:", datatype.__name__, '-', operation, '-'.join(other_params), '[' + device.name, '(' + device.platform.name + ')] for sizes', sizes))))
                    with open(fname, "w+") as archive:
                        return optimize.genetic(statement, device, TYPES[operation]['template'],
                                                lambda params: TYPES[operation]['template'](params, *other_params),
                                                lambda t: TYPES[operation]['perf-index']([datatype().itemsize, sizes, t]),
                                                TYPES[operation]['perf-measure'], archive)

            # Helper for tuning
            def tune(execution_handler, nTuning, nDataPoints, draw, additional_parameters):
                if 'size' in p:
                    profile = execution_handler(map_to_list(int, p['size']))
                    if viennacl_src_root is not None:
                        tools.update_viennacl_headers(viennacl_src_root, device, datatype, operation, additional_parameters, profile)
                else:
                    X, Y, profiles = generate_dataset(TYPES[operation]['template'], execution_handler, nTuning, nDataPoints, draw)
                    train_model(X, Y, profiles, TYPES[operation]['perf-measure'])

            # NOTE: the size generators below use ``random``, which in this
            # module is ``numpy.random`` (``from numpy import random``).

            # Vector AXPY
            if operation == 'vector-axpy':
                def execution_handler(sizes, fname=os.devnull, parameters=None):
                    x = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
                    y = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
                    z = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
                    return execute(device, vcl.Assign(z, vcl.ElementProd(vcl.exp(x + y), vcl.cos(x + y))), (), sizes, fname, parameters)
                tune(execution_handler, 30, 1000, lambda: 64*random.randint(low=10, high=100000, size=1), ())

            # Reduction
            if operation == 'reduction':
                def execution_handler(sizes, fname=os.devnull, parameters=None):
                    x = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
                    y = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
                    s = vcl.Scalar(0, context=ctx, dtype=datatype)
                    return execute(device, vcl.Assign(s, vcl.Dot(x, y)), (), sizes, fname, parameters)
                tune(execution_handler, 50, 1000, lambda: 64*random.randint(low=10, high=100000, size=1), ())

            # Matrix AXPY
            if operation == 'matrix-axpy':
                def execution_handler(sizes, fname=os.devnull, parameters=None):
                    A = vcl.Matrix(sizes, context=ctx, dtype=datatype)
                    B = vcl.Matrix(sizes, context=ctx, dtype=datatype)
                    C = vcl.Matrix(sizes, context=ctx, dtype=datatype)
                    return execute(device, vcl.Assign(C, A+B), (), sizes, fname, parameters)
                tune(execution_handler, 50, 1000, lambda: 64*random.randint(low=5, high=100, size=2), ())

            # Row-wise reduction
            if operation == 'row-wise-reduction':
                layouts = map_to_list(str, p['layout'])
                if 'all' in layouts:
                    layouts = ['N', 'T']
                for A_trans in layouts:
                    def execution_handler(sizes, fname=os.devnull, parameters=None):
                        A = vcl.Matrix(sizes if A_trans == 'N' else sizes[::-1], context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
                        x = vcl.Vector(sizes[1] if A_trans == 'N' else sizes[0], context=ctx, dtype=datatype)
                        y = vcl.Vector(sizes[0] if A_trans == 'N' else sizes[1], context=ctx, dtype=datatype)
                        LHS = A if A_trans == 'N' else A.T
                        return execute(device, vcl.Assign(y, LHS*x), (), sizes, fname, parameters)
                    tune(execution_handler, 50, 1000, lambda: 64*random.randint(low=5, high=100, size=2), (A_trans,))

            # Matrix Product
            if operation == 'matrix-product':
                layouts = map_to_list(str, p['layout'])
                if 'all' in layouts:
                    layouts = ['NN', 'NT', 'TN', 'TT']
                for layout in layouts:
                    def execution_handler(sizes, fname=os.devnull, parameters=None):
                        A_trans = layout[0]
                        B_trans = layout[1]
                        A = vcl.Matrix((sizes[0], sizes[1]) if A_trans == 'N' else (sizes[1], sizes[0]), context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
                        B = vcl.Matrix((sizes[1], sizes[2]) if B_trans == 'N' else (sizes[2], sizes[1]), context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
                        LHS = A if A_trans == 'N' else A.T
                        RHS = B if B_trans == 'N' else B.T
                        alpha = vcl.HostScalar(1.0, context=ctx, dtype=datatype)
                        beta = vcl.HostScalar(1.0, context=ctx, dtype=datatype)
                        C = vcl.Matrix((sizes[0], sizes[2]), context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
                        return execute(device, vcl.Assign(C, LHS*RHS*alpha + C*beta), (A_trans, B_trans), sizes, fname, parameters)
                    tune(execution_handler, 50, 2000, lambda: 64*random.randint(low=1, high=40, size=3), (layout[0], layout[1]))
if __name__ == "__main__":
    # Command line: ``list-devices`` prints the available OpenCL devices with
    # the index expected by the 'devices' configuration entry; ``tune`` runs
    # the auto-tuner on a configuration file.
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest='action')
    print_devices_parser = subparsers.add_parser('list-devices', help='list the devices available')
    tune_parser = subparsers.add_parser('tune', help='tune using a specific configuration file')
    tune_parser.add_argument("--config", default="config.ini", required=False, type=str)
    tune_parser.add_argument("--viennacl-root", default='', required=False, type=str)
    args = parser.parse_args()

    if args.action == 'list-devices':
        print("----------------")
        print("Devices available:")
        print("----------------")
        devices = [d for platform in cl.get_platforms() for d in platform.get_devices()]
        for i, d in enumerate(devices):
            # The type label comes from pyopencl itself: the tools module
            # exposes no DEVICE_TYPE_PREFIX table. The line is joined by hand
            # (as done in do_tuning) so that it is not printed as a tuple
            # when ``print`` is a statement.
            type_label = cl.device_type.to_string(d.type).upper()
            print(' '.join(map(str, ('Device', i, ':', type_label + ':', d.name, 'on', d.platform.name))))
        print("----------------")
    else:
        print("------")
        print("Auto-tuning")
        print("------")
        do_tuning(args.config, args.viennacl_root)

View File

@@ -0,0 +1,63 @@
import os
import sys
import re
import random
import numpy as np
from pyviennacl.atidlas import FetchingPolicy
def resample(X, draw):
    """Draw samples until one is found that is not already a row of ``X``.

    :param X: iterable of rows (e.g. a 2-D array) holding the samples that
        must not be produced again.
    :param draw: zero-argument callable returning a candidate sample; the
        returned object must be iterable and provide ``astype``.
    :return: the first candidate whose values differ from every row of
        ``X``, converted with ``astype(int)``.

    The function loops for as long as ``draw`` keeps returning known rows.
    """
    # A set gives constant-time membership tests instead of a linear scan.
    seen = set(tuple(row) for row in X)
    while True:
        x = draw()
        if tuple(x) not in seen:
            return x.astype(int)
def generate_dataset(TemplateType, execution_handler, nTuning, nDataPoints, draw):
    """Collect profiles, then record the result of each one on random inputs.

    First phase: ``nTuning`` distinct samples are drawn and
    ``execution_handler(x)`` is called on each; the distinct return values
    form the list of profiles. Second phase: ``nDataPoints`` distinct samples
    are drawn and ``execution_handler(x, os.devnull, profile)`` is called for
    every profile; its return value is stored in ``Y[i, j]``.

    The arrays are written to ``data/<TemplateType.__name__>/`` as
    ``profiles.csv``, ``X.csv`` and ``Y.csv`` and read back from there.

    :param TemplateType: class whose ``__name__`` names the output directory.
    :param execution_handler: callable ``(sizes, fname=..., parameters=...)``.
    :param nTuning: number of samples used to collect profiles.
    :param nDataPoints: number of rows of the generated dataset.
    :param draw: zero-argument callable returning one sample (an array).
    :return: ``(X, Y, profiles)`` as loaded back from the CSV files.
    """
    print("Getting some good profiles...")
    nDim = draw().size
    X = np.empty((nTuning, nDim))
    profiles = []
    for i in range(nTuning):
        # Only the rows filled so far are meaningful: the remainder of an
        # ``np.empty`` array is uninitialised memory.
        x = resample(X[:i], draw)
        y = execution_handler(x)
        if y not in profiles:
            profiles.append(y)
        X[i, :] = x

    print("Generating the dataset...")
    Y = np.empty((nDataPoints, len(profiles)))
    X = np.empty((nDataPoints, nDim))
    for i in range(nDataPoints):
        x = resample(X[:i], draw)
        for j, y in enumerate(profiles):
            Y[i, j] = execution_handler(x, os.devnull, y)
        X[i, :] = x
        if i % 10 == 0:
            sys.stdout.write('%d data points generated\r' % i)
            sys.stdout.flush()

    template_name = TemplateType.__name__
    out_dir = os.path.join("data", template_name)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    np.savetxt(os.path.join(out_dir, "profiles.csv"), profiles)
    np.savetxt(os.path.join(out_dir, "X.csv"), X)
    np.savetxt(os.path.join(out_dir, "Y.csv"), Y)

    # The returned arrays are the ones read back from disk.
    profiles = np.loadtxt(os.path.join(out_dir, "profiles.csv"))
    X = np.loadtxt(os.path.join(out_dir, "X.csv"), ndmin=2)
    Y = np.loadtxt(os.path.join(out_dir, "Y.csv"), ndmin=2)
    return X, Y, profiles

190
python/autotune/genetic.py Normal file
View File

@@ -0,0 +1,190 @@
import random
import time
import sys
import tools
import pyviennacl as vcl
import numpy as np
import copy
from deap import algorithms
from deap import base
from deap import creator
from deap import tools as deap_tools
from collections import OrderedDict as odict
def closest_divisor(N, x):
    """Return a divisor of ``N`` close to ``x``.

    The search starts from ``round(x)`` clamped to ``[1, N]`` and walks
    downwards and upwards until a value dividing ``N`` is met (the upward
    walk also stops at ``N``). The lower candidate is returned only when it
    is strictly closer to ``x`` than the upper one.
    """
    start = max(1, min(round(x), N))

    lower = start
    while N % lower > 0 and lower > 0:
        lower -= 1

    upper = start
    while N % upper > 0 and upper < N:
        upper += 1

    if x - lower < upper - x:
        return lower
    return upper
def b_gray_to_bin(A='00000000', endian='big'):
    """Convert a Gray-coded bit string to the plain binary bit string.

    :param A: string of '0'/'1' characters holding the Gray code.
    :param endian: 'big' (first character is the most significant bit) or
        'little'; the result uses the same convention as the input.
    :return: the decoded bit string, of the same length as ``A``.
    """
    assert type(endian) is str
    assert endian == 'little' or endian == 'big'

    little = endian == 'little'
    if little:
        # Work on a big-endian copy
        A = A[::-1]

    # Each output bit tells whether the previous output bit differs from
    # the current input bit.
    bits = [A[0]]
    for symbol in A[1:]:
        bits.append('1' if bits[-1] != symbol else '0')
    b = ''.join(bits)

    if little:
        # Restore the caller's bit order
        b = b[::-1]
    return b
class GeneticOperators(object):
    """Genetic search over the parameters of an atidlas template.

    A genome is a flat list. For every entry of ``genome_info`` it holds
    either ``n`` characters '0'/'1' (a Gray-coded exponent, decoded to
    ``2**value``) when the entry is the integer ``n``, or a single integer
    in ``[0, 2]`` selecting a fetching policy when the entry is
    ``FetchingPolicy``.
    """

    def __init__(self, device, statement, TemplateType, build_template, out):
        """Store the problem description and register the DEAP operators.

        :param device: device passed to ``tools.skip``/``tools.benchmark``.
        :param statement: statement passed to the same helpers.
        :param TemplateType: one of the atidlas template classes listed in
            the genome table below.
        :param build_template: callable turning a ``TemplateType.Parameters``
            instance into a template.
        :param out: writable file-like object; one CSV line is appended per
            successfully benchmarked individual.
        """
        self.device = device
        self.statement = statement
        self.TemplateType = TemplateType
        self.ParameterType = TemplateType.Parameters
        self.build_template = build_template
        self.cache = {}
        self.out = out
        self.genome_info = {
            vcl.atidlas.VectorAxpyTemplate: [3, 4, 4, vcl.atidlas.FetchingPolicy],
            vcl.atidlas.ReductionTemplate: [3, 4, 4, vcl.atidlas.FetchingPolicy],
            vcl.atidlas.MatrixAxpyTemplate: [3, 3, 3, 3, 3, vcl.atidlas.FetchingPolicy],
            vcl.atidlas.RowWiseReductionTemplate: [3, 3, 3, 4, vcl.atidlas.FetchingPolicy],
            vcl.atidlas.MatrixProductTemplate: [3, 3, 3, 3, 3, 3, 3, vcl.atidlas.FetchingPolicy, vcl.atidlas.FetchingPolicy, 3]
        }[TemplateType]
        # Per-gene mutation probability: one expected mutation per genome.
        self.indpb = 1.0/sum([1 if x == vcl.atidlas.FetchingPolicy else x for x in self.genome_info])
        creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
        creator.create("Individual", list, fitness=creator.FitnessMin)
        self.toolbox = base.Toolbox()
        self.toolbox.register("population", self.init)
        self.toolbox.register("evaluate", self.evaluate)
        self.toolbox.register("mate", deap_tools.cxTwoPoint)
        self.toolbox.register("mutate", self.mutate)
        self.toolbox.register("select", deap_tools.selNSGA2)

    def decode(self, genome):
        """Translate a genome into the list of template parameters."""
        FetchingPolicy = vcl.atidlas.FetchingPolicy
        fetch = [FetchingPolicy.FETCH_FROM_LOCAL, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS, FetchingPolicy.FETCH_FROM_GLOBAL_STRIDED]

        def decode_element(bits):
            return 2**int(b_gray_to_bin(''.join(bits)), 2)

        result = []
        offset = 0
        for x in self.genome_info:
            if x == vcl.atidlas.FetchingPolicy:
                result.append(fetch[genome[offset]])
                offset = offset + 1
            else:
                result.append(decode_element(genome[offset:offset+x]))
                offset = offset + x

        # GEMM peculiarities
        if self.TemplateType == vcl.atidlas.MatrixProductTemplate:
            # Only slots 7 and 8 hold fetching policies (see genome_info);
            # the other slots are integers and must not take part in the test.
            if FetchingPolicy.FETCH_FROM_LOCAL in (result[7], result[8]):
                # Floor division keeps the value an integer on every
                # Python version.
                lf1 = result[1]*result[3]//result[9]
            else:
                result[9] = 0
                lf1 = 0
            result.append(lf1)
        return result

    def init(self, N):
        """Return ``N`` random individuals that ``tools.skip`` accepts."""
        result = []
        while len(result) < N:
            while True:
                bincode = []
                for x in self.genome_info:
                    if x == vcl.atidlas.FetchingPolicy:
                        bincode.append(random.randint(0, 2))
                    else:
                        bincode.extend(str(random.randint(0, 1)) for i in range(x))
                parameters = self.decode(bincode)
                template = self.build_template(self.TemplateType.Parameters(*parameters))
                if not tools.skip(template, self.statement, self.device):
                    result.append(creator.Individual(bincode))
                    break
        return result

    def mutate(self, individual):
        """Return a one-element tuple holding a mutated copy of ``individual``.

        Each gene is changed with probability ``self.indpb``; the whole draw
        is repeated until ``tools.skip`` accepts the resulting template.
        """
        while True:
            new_individual = copy.deepcopy(individual)
            for i in range(len(new_individual)):
                if isinstance(individual[i], int) and random.random() < self.indpb:
                    # Fetching-policy gene: pick a different policy index
                    while new_individual[i] == individual[i]:
                        new_individual[i] = random.randint(0, 2)
                elif not isinstance(individual[i], int) and random.random() < self.indpb:
                    # Bit gene: flip it
                    new_individual[i] = '1' if new_individual[i] == '0' else '0'
            parameters = self.decode(new_individual)
            template = self.build_template(self.TemplateType.Parameters(*parameters))
            if not tools.skip(template, self.statement, self.device):
                break
        return new_individual,

    def evaluate(self, individual):
        """Return the fitness of ``individual`` as a one-element tuple.

        The value returned by ``tools.benchmark`` is cached per genome and a
        CSV line ``<value>,<param>,...`` is appended to ``self.out``. When
        benchmarking or logging fails, the fixed value 10 is cached instead
        (presumably a penalty larger than any real measurement).
        """
        key = tuple(individual)
        if key not in self.cache:
            parameters = self.decode(individual)
            template = self.build_template(self.TemplateType.Parameters(*parameters))
            try:
                tt = tools.benchmark(template, self.statement, self.device)
                fields = [str(tt)] + [str(int(value)) for value in parameters]
                self.out.write(','.join(fields) + '\n')
                self.cache[key] = tt
            except Exception:
                # Unlike a bare ``except``, this lets KeyboardInterrupt and
                # SystemExit stop the tuning run.
                self.cache[key] = 10
        return self.cache[key],

    def optimize(self, maxtime, maxgen, compute_perf, perf_metric):
        """Evolve a population and return the decoded best parameters.

        :param maxtime: time budget formatted as ``'<minutes>m<seconds>s'``.
        :param maxgen: maximum number of generations.
        :param compute_perf: callable mapping the best fitness value to the
            figure displayed in the progress line.
        :param perf_metric: unit label displayed next to that figure.
        """
        hof = deap_tools.HallOfFame(1)
        # Begin the generational process
        gen = 0
        maxtime = time.strptime(maxtime, '%Mm%Ss')
        maxtime = maxtime.tm_min*60 + maxtime.tm_sec
        start_time = time.time()
        mu = 30
        cxpb = 0.2
        mutpb = 0.7

        population = self.init(mu)
        invalid_ind = [ind for ind in population if not ind.fitness.valid]
        fitnesses = self.toolbox.map(self.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
        hof.update(population)

        while time.time() - start_time < maxtime and gen < maxgen:
            # Vary the population
            offspring = []
            for _ in range(mu):
                op_choice = random.random()
                if op_choice < cxpb:  # Apply crossover
                    ind1, ind2 = map(self.toolbox.clone, random.sample(population, 2))
                    ind1, ind2 = self.toolbox.mate(ind1, ind2)
                    del ind1.fitness.values
                    offspring.append(ind1)
                elif op_choice < cxpb + mutpb:  # Apply mutation
                    ind = self.toolbox.clone(random.choice(population))
                    ind, = self.toolbox.mutate(ind)
                    del ind.fitness.values
                    offspring.append(ind)
                else:  # Apply reproduction
                    offspring.append(random.choice(population))

            # Evaluate the individuals with an invalid fitness
            invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
            fitnesses = self.toolbox.map(self.evaluate, invalid_ind)
            for ind, fit in zip(invalid_ind, fitnesses):
                ind.fitness.values = fit

            # Update the hall of fame with the generated individuals
            hof.update(offspring)

            # Select the next generation population
            population[:] = self.toolbox.select(population + offspring, mu)

            # Update
            gen = gen + 1
            best_profile = '(%s)' % ','.join(map(str, self.decode(hof[0])))
            best_performance = compute_perf(hof[0].fitness.values[0])
            sys.stdout.write('Generation %d | Time %d | Best %d %s [ for %s ]\r' % (gen, time.time() - start_time, best_performance, perf_metric, best_profile))
            sys.stdout.flush()
        sys.stdout.write('\n')
        return self.decode(hof[0])

83
python/autotune/model.py Normal file
View File

@@ -0,0 +1,83 @@
from sklearn import tree
from sklearn import ensemble
from numpy import array, bincount, mean, std, max, argmax, min, argmin, median
from scipy.stats import gmean
# def random_forest(Xtr, Ytr):
# clf = ensemble.RandomForestRegressor(10, max_depth=7).fit(Xtr,Ytr)
#
# def predict_tree(tree, x):
# tree_ = tree.tree_
# children_left = tree_.children_left
# children_right = tree_.children_right
# threshold = tree_.threshold
# feature = tree_.feature
# value = tree_.value
# idx = 0
# while children_left[idx]!=-1:
# if x[0, feature[idx]] <= threshold[idx]:
# idx = children_left[idx]
# else:
# idx = children_right[idx]
# return value[[idx],:,:][:,:,0]
#
# s = 0
# for e in clf.estimators_:
# tree_ = e.tree_
# children_left = tree_.children_left
# children_right = tree_.children_right
# threshold = tree_.threshold
# feature = tree_.feature
# value = tree_.value
# s = s + value.size + feature.size + threshold.size + children_right.size + children_left.size
# print s*4*1e-3
# return clf, clf.predict
#
# def train_nn(layer_sizes, XTr, YTr, XTe, YTe):
# optimizer = HF(open(os.devnull, 'w'), 15)
# optimizer.doCGBacktracking = True
# net = FeedforwardNeuralNet(layer_sizes, [Act.Tanh() for i in range(len(layer_sizes)-2)], Act.Linear(), 1e-5)
#
# nbatch=10
# bsize = XTr.shape[0]/nbatch
# data = ((XTr[(i%nbatch)*bsize:(i%nbatch+1)*bsize,:], YTr[(i%nbatch)*bsize:(i%nbatch+1)*bsize,:]) for i in range(nbatch))
# data = HFDataSource(data, bsize, gradBatchSize = nbatch*bsize, curvatureBatchSize = bsize, lineSearchBatchSize =nbatch*bsize, gradBatchIsTrainingSet=True)
# iters = optimizer.optimize(HFModel(net), data, 300, otherPrecondDampingTerm=net.L2Cost)
# bestte = collections.deque([float("inf")]*5, maxlen=5)
# for i,w in enumerate(iters):
# Diffte = YTe - net.predictions(XTe).as_numpy_array()
# Difftr = YTr - net.predictions(XTr).as_numpy_array()
# Ete = np.sum(Diffte**2)
# Etr = np.sum(Difftr**2)
# bestte.append(min(min(bestte),Ete))
# if min(bestte)==max(bestte):
# print 'Final test error: ', Ete
# return net, net.predictions
# print 'Iteration %d | Test error = %.2f | Train error = %.2f'%(i, Ete, Etr)
# return net, net.predictions
def train_model(X, Y, profiles, metric):
    """Fit a random forest on the dataset and print speed-up statistics.

    ``X`` is normalised with its global mean and standard deviation and
    ``Y`` is divided by its global maximum. The first ``int(0.8*n + 1)``
    rows train the regressor; on the remaining rows, the ratio between the
    value of the reference column (the one that is most often the row
    minimum) and the value of the selected column is reported, once for the
    column chosen by the model and once for the true row minimum.

    ``profiles`` and ``metric`` are accepted but not used. Nothing is
    returned.
    """
    print("Building the model...")

    # Normalisation
    X = (X - mean(X))/std(X)
    Y = Y[:, :]
    Y = Y/max(Y)

    # Most common profile
    ref = argmax(bincount(argmin(Y, axis=1)))

    # Train / test split
    cut = int(0.800*X.shape[0]+1)
    Xtrain, Ytrain = X[:cut, :], Y[:cut, :]
    Xtest, Ytest = X[cut:, :], Y[cut:, :]

    # Train the model
    clf = ensemble.RandomForestRegressor(10, max_depth=10).fit(Xtrain, Ytrain)

    def speedups(selection):
        return array([y[ref]/y[k] for y, k in zip(Ytest, selection)])

    s = speedups(argmin(clf.predict(Xtest), axis=1))
    ss = speedups(argmin(Ytest, axis=1))
    print("Testing speedup : mean = %.3f, median = %.3f, min = %.3f, max %.3f"%(gmean(s), median(s), min(s), max(s)))
    print("Optimal speedup : mean = %.3f, median = %.3f, min = %.3f, max %.3f"%(gmean(ss), median(ss), min(ss), max(ss)))

View File

@@ -0,0 +1,53 @@
import array
import numpy as np
import random
import sys
import itertools
import tools
import deap.tools
from genetic import GeneticOperators
#~ def parameter_space(operation):
#~ simd = [1, 2, 4, 8]
#~ pow2_1D = [2**k for k in range(12)]
#~ pow2_2D = [2**i for i in range(8)]
#~ pow2_2D_unrolled = [2**i for i in range(8)]
#~ FetchingPolicy = vcl.atidlas.FetchingPolicy
#~ fetch = [FetchingPolicy.FETCH_FROM_LOCAL, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS, FetchingPolicy.FETCH_FROM_GLOBAL_STRIDED]
#~ if operation == 'vector-axpy': return [simd, pow2_1D, pow2_1D, fetch]
#~ if operation == 'reduction': return [simd, pow2_1D, pow2_1D, fetch]
#~ if operation == 'matrix-axpy': return [simd, pow2_2D, pow2_2D, pow2_2D, pow2_2D, fetch]
#~ if operation == 'row-wise-reduction': return [simd, pow2_2D, pow2_2D, pow2_1D, fetch]
#~ if operation == 'matrix-product': return [simd, pow2_2D, pow2_2D, pow2_2D, pow2_2D_unrolled, pow2_2D_unrolled, pow2_2D_unrolled, fetch, fetch, [0] + pow2_2D, [0] + pow2_2D]
#~
#~ def exhaustive(statement, context, TemplateType, build_template, parameter_names, all_parameters, compute_perf, perf_metric, out):
#~ device = context.devices[0]
#~ nvalid = 0
#~ current = 0
#~ minT = float('inf')
#~ for individual in itertools.product(*all_parameters):
#~ template = build_template(TemplateType.Parameters(*individual))
#~ if not tools.skip(template, statement, device):
#~ nvalid = nvalid + 1
#~ for individual in itertools.product(*all_parameters):
#~ template = build_template(TemplateType.Parameters(*individual))
#~ try:
#~ T = tools.benchmark(template,statement,device)
#~ current = current + 1
#~ if T < minT:
#~ minT = T
#~ best = individual
#~ sys.stdout.write('%d / %d , Best is %d %s for %s\r'%(current, nvalid, compute_perf(minT), perf_metric, best))
#~ sys.stdout.flush()
#~ except:
#~ pass
#~ sys.stdout.write('\n')
#~ sys.stdout.flush()
#~
def genetic(statement, device, TemplateType, build_template, compute_perf, perf_metric, out):
    """Run the genetic search with a budget of 2m30s / 1000 generations.

    All arguments are forwarded to ``GeneticOperators``; the value returned
    by its ``optimize`` method is returned unchanged.
    """
    operators = GeneticOperators(device, statement, TemplateType, build_template, out)
    best = operators.optimize(maxtime='2m30s',
                              maxgen=1000,
                              compute_perf=compute_perf,
                              perf_metric=perf_metric)
    return best

343
python/autotune/tools.py Normal file
View File

@@ -0,0 +1,343 @@
from __future__ import division
import pyopencl
import time
import os
import sys
import pyopencl as cl
import pyviennacl as vcl
from pyviennacl.atidlas import StatementsTuple
class PhysicalLimitsNV:
    """Per-multiprocessor limits of an NVIDIA device.

    The attributes are chosen from the compute capability reported by
    ``pyopencl.characterize.nv_compute_capability``; major versions 1, 2 and
    3 are handled, anything else raises ``Exception``.
    """

    def __init__(self, dev):
        self.compute_capability = pyopencl.characterize.nv_compute_capability(dev)
        major = self.compute_capability[0]

        if major == 1:
            # 1.0/1.1 and 1.2+ differ in four of the limits
            if self.compute_capability[1] <= 1:
                variant = (24, 768, 8192, 256)
            else:
                variant = (32, 1024, 16384, 512)
            (self.warps_per_mp,
             self.threads_per_mp,
             self.num_32b_reg_per_mp,
             self.reg_alloc_unit_size) = variant
            self.threads_per_warp = 32
            self.thread_blocks_per_mp = 8
            self.reg_alloc_granularity = 'block'
            self.reg_per_thread = 124
            self.shared_mem_per_mp = 16384
            self.shared_mem_alloc_unit_size = 512
            self.warp_alloc_granularity = 2
            self.max_thread_block_size = 512
        elif major == 2:
            self.threads_per_warp = 32
            self.warps_per_mp = 48
            self.threads_per_mp = 1536
            self.thread_blocks_per_mp = 8
            self.num_32b_reg_per_mp = 32768
            self.reg_alloc_unit_size = 64
            self.reg_alloc_granularity = 'warp'
            self.reg_per_thread = 63
            self.shared_mem_per_mp = 49152
            self.shared_mem_alloc_unit_size = 128
            self.warp_alloc_granularity = 2
            self.max_thread_block_size = 1024
        elif major == 3:
            self.threads_per_warp = 32
            self.warps_per_mp = 64
            self.threads_per_mp = 2048
            self.thread_blocks_per_mp = 16
            self.num_32b_reg_per_mp = 65536
            self.reg_alloc_unit_size = 256
            self.reg_alloc_granularity = 'warp'
            # Only capability 3.5 gets the larger per-thread register file
            self.reg_per_thread = 255 if self.compute_capability[1] == 5 else 63
            self.shared_mem_per_mp = 49152
            self.shared_mem_alloc_unit_size = 256
            self.warp_alloc_granularity = 4
            self.max_thread_block_size = 1024
        else:
            raise Exception('Compute capability not supported!')
class PhysicalLimitsAMD:
    """Per-compute-unit limits of an AMD device, looked up by device name.

    ``dev.name`` must be one of the keys of the table below, otherwise
    ``KeyError`` is raised. The instance exposes ``WFsize`` (always 64),
    ``WFmax_cu``, ``LDS_cu``, ``GPR_cu`` and ``arch``.
    """

    def __init__(self, dev):
        infos = {
            # HD5000
            'Cedar':     {'arch': 'VLIW', 'WFmax_cu': 96,   'LDS_cu': 32768, 'GPR_cu': 8192},
            'Redwood':   {'arch': 'VLIW', 'WFmax_cu': 62,   'LDS_cu': 32768, 'GPR_cu': 16384},
            'Juniper':   {'arch': 'VLIW', 'WFmax_cu': 24.8, 'LDS_cu': 32768, 'GPR_cu': 16384},
            'Cypress':   {'arch': 'VLIW', 'WFmax_cu': 27.6, 'LDS_cu': 32768, 'GPR_cu': 16384},
            'Hemlock':   {'arch': 'VLIW', 'WFmax_cu': 24.8, 'LDS_cu': 32768, 'GPR_cu': 16384},
            # HD6000
            'Seymour':   {'arch': 'VLIW', 'WFmax_cu': 96,   'LDS_cu': 32768, 'GPR_cu': 16384},
            'Caicos':    {'arch': 'VLIW', 'WFmax_cu': 96,   'LDS_cu': 32768, 'GPR_cu': 16384},
            'Turks':     {'arch': 'VLIW', 'WFmax_cu': 41.3, 'LDS_cu': 32768, 'GPR_cu': 16384},
            'Whistler':  {'arch': 'VLIW', 'WFmax_cu': 41.3, 'LDS_cu': 32768, 'GPR_cu': 16384},
            'Bart':      {'arch': 'VLIW', 'WFmax_cu': 49.6, 'LDS_cu': 32768, 'GPR_cu': 16384},
            # HD7000
            'Capeverde': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
            'Pitcairn':  {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
            'Bonaire':   {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
            'Tahiti':    {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
            # Rx 200
            'Oland':     {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
            'Tonga':     {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
            'Hawaii':    {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
        }
        self.WFsize = 64
        entry = infos[dev.name]
        self.WFmax_cu = entry['WFmax_cu']
        self.LDS_cu = entry['LDS_cu']
        self.GPR_cu = entry['GPR_cu']
        self.arch = entry['arch']
def _int_floor(value, multiple_of=1):
"""Round C{value} down to be a C{multiple_of} something."""
# Mimicks the Excel "floor" function (for code stolen from occupancy calculator)
from math import floor
return int(floor(value/multiple_of))*multiple_of
def _int_ceiling(value, multiple_of=1):
"""Round C{value} up to be a C{multiple_of} something."""
# Mimicks the Excel "floor" function (for code stolen from occupancy calculator)
from math import ceil
return int(ceil(value/multiple_of))*multiple_of
class OccupancyRecord:
    """Estimated occupancy (in percent) of a kernel configuration.

    After construction, ``self.occupancy`` holds the estimate computed from
    the work-group size, the local (shared) memory usage and the register
    usage, using ``PhysicalLimitsAMD`` or ``PhysicalLimitsNV`` depending on
    ``dev.vendor``. For any other vendor no model is available and the
    occupancy is reported as 100.
    """

    def init_nvidia(self, dev, threads, shared_mem, registers):
        """Set ``self.occupancy`` for an NVIDIA device."""
        pl = PhysicalLimitsNV(dev)
        # Each entry is (maximum number of resident blocks, limiting resource)
        limits = []

        # Limit due to the number of warps
        allocated_warps = max(1, _int_ceiling(threads/pl.threads_per_warp))
        max_warps_per_mp = pl.warps_per_mp
        limits.append((min(pl.thread_blocks_per_mp, _int_floor(max_warps_per_mp/allocated_warps)), 'warps'))

        # Limit due to the registers
        if registers > 0:
            if registers > pl.reg_per_thread:
                limits.append((0, 'registers'))
            else:
                allocated_regs = {'warp': allocated_warps,
                                  'block': _int_ceiling(_int_ceiling(allocated_warps, pl.warp_alloc_granularity)*registers*pl.threads_per_warp, allocated_warps)}[pl.reg_alloc_granularity]
                max_reg_per_mp = {'warp': _int_floor(pl.num_32b_reg_per_mp/_int_ceiling(registers*pl.threads_per_warp, pl.reg_alloc_unit_size), pl.warp_alloc_granularity),
                                  'block': pl.num_32b_reg_per_mp}[pl.reg_alloc_granularity]
                limits.append((_int_floor(max_reg_per_mp/allocated_regs), 'registers'))

        # Limit due to the shared memory
        if shared_mem > 0:
            allocated_shared_mem = _int_ceiling(shared_mem, pl.shared_mem_alloc_unit_size)
            max_shared_mem_per_mp = pl.shared_mem_per_mp
            limits.append((_int_floor(max_shared_mem_per_mp/allocated_shared_mem), 'shared memory'))

        # The most restrictive resource decides
        limit, limited_by = min(limits)
        warps_per_mp = limit*allocated_warps
        self.occupancy = 100*warps_per_mp/pl.warps_per_mp

    def init_amd(self, dev, threads, shared_mem, NReg):
        """Set ``self.occupancy`` for an AMD device."""
        pl = PhysicalLimitsAMD(dev)
        limits = {}
        WFwg = _int_ceiling(threads/pl.WFsize)
        # WFmax without constraint
        if pl.arch == 'VLIW':
            limits['wg'] = pl.WFmax_cu if WFwg > pl.WFmax_cu else _int_floor(pl.WFmax_cu, WFwg)
        else:
            limits['wg'] = min(16*WFwg, pl.WFmax_cu)
        # WFmax with LDS constraints
        if shared_mem > 0:
            WGmax = _int_floor(pl.LDS_cu/shared_mem)
            limits['lds'] = WGmax*WFwg
        # WFmax with GPR constraints
        if NReg > 0:
            # Amount of work group per CU
            NRegWG = NReg*pl.WFsize*WFwg
            WGmax = _int_floor(pl.GPR_cu/NRegWG)
            limits['gpr'] = WFwg*WGmax
        self.occupancy = 100.0*min(list(limits.values()))/pl.WFmax_cu

    def __init__(self, dev, threads, shared_mem=0, registers=0):
        """Compute the occupancy for ``dev``.

        :param dev: device object exposing ``vendor`` (and whatever the
            vendor-specific limits class reads).
        :param threads: number of work-items per work-group.
        :param shared_mem: local memory used per work-group.
        :param registers: registers used per work-item.
        """
        vendor = dev.vendor.lower()
        if 'advanced micro devices' in vendor:
            self.init_amd(dev, threads, shared_mem, registers)
        elif 'nvidia' in vendor:
            self.init_nvidia(dev, threads, shared_mem, registers)
        else:
            # No occupancy model for this vendor: report full occupancy so
            # that callers comparing against a threshold do not reject the
            # configuration (and do not fail on a missing attribute).
            self.occupancy = 100
def skip(template, statement, device):
    """Tell whether a template should be discarded without being run.

    Returns ``True`` when ``template.check(statement)`` is truthy or when the
    estimated occupancy on ``device`` is below 15, ``False`` otherwise.
    """
    statements = StatementsTuple(statement)
    registers = template.registers_usage(statements)/4
    local_memory = template.lmem_usage(statements)
    work_group_size = template.parameters.local_size_0*template.parameters.local_size_1
    record = OccupancyRecord(device, work_group_size, local_memory, registers)
    return bool(template.check(statement) or record.occupancy < 15)
def benchmark(template, statement, device):
    """Return the mean wall-clock time of one execution of ``template``.

    Raises ``ValueError`` when the estimated occupancy on ``device`` is below
    15. Otherwise the template is executed once with its second argument set
    to ``True`` (not timed), then repeatedly with ``False`` until at least
    0.1 second has been accumulated; all queues of the result's context are
    finished after every execution.
    """
    statements = StatementsTuple(statement)
    registers = template.registers_usage(statements)/4
    local_memory = template.lmem_usage(statements)
    work_group_size = template.parameters.local_size_0*template.parameters.local_size_1
    record = OccupancyRecord(device, work_group_size, local_memory, registers)
    if record.occupancy < 15:
        raise ValueError("Template has too low occupancy")

    # Untimed first run
    template.execute(statement, True)
    statement.result.context.finish_all_queues()

    runs = 0
    elapsed = 0
    while elapsed < 1e-1:
        started = time.time()
        template.execute(statement, False)
        statement.result.context.finish_all_queues()
        elapsed = elapsed + time.time() - started
        runs += 1
    return elapsed/runs
def update_viennacl_headers(viennacl_root, device, datatype, operation, additional_parameters, parameters):
    """Record tuned parameters in the ViennaCL built-in database headers.

    Two files below ``<viennacl_root>/device_specific/builtin_database`` are
    edited:

    * the device header
      ``devices/<type>/<vendor>/<family>/<device>.hpp`` (created with an
      include-guard/namespace skeleton when missing or empty), in which the
      function ``add_4B``/``add_8B`` for this operation is inserted or has
      its body replaced;
    * the operation header ``<operation>.hpp``, which gets an include of the
      device header and a call to that function before the ``return result``
      of the matching ``init_<operation>...`` scope.

    :param viennacl_root: ViennaCL directory containing ``device_specific``.
    :param device: device exposing ``name``, ``type`` and ``vendor_id``.
    :param datatype: ``vcl.float32`` or ``vcl.float64``.
    :param operation: operation name, e.g. ``'matrix-product'``.
    :param additional_parameters: tuple of ``'N'``/``'T'`` flags.
    :param parameters: the tuned template parameters.
    :raises EnvironmentError: if the built-in database directory is missing.
    """
    def sanitize_string(string, keep_chars = ['_']):
        """Lower-case ``string`` and keep only alphanumerics and ``keep_chars``."""
        string = string.replace(' ', '_').replace('-', '_').lower()
        kept = [c for c in string if c.isalnum() or c in keep_chars]
        return "".join(kept).rstrip()

    def append_include(data, path):
        """Insert ``#include "path"`` after the first ``#define`` line if absent."""
        include_name = '#include "' + path +'"\n'
        if include_name in data:
            return data
        insert_index = data.index('\n', data.index('#define')) + 1
        return data[:insert_index] + '\n' + include_name + data[insert_index:]

    builtin_database_dir = os.path.join(viennacl_root, "device_specific", "builtin_database")
    if not os.path.isdir(builtin_database_dir):
        raise EnvironmentError('ViennaCL root path is incorrect. Cannot access ' + builtin_database_dir + '!\n'
                               'Your version of ViennaCL may be too old and/or corrupted.')

    function_name_dict = {vcl.float32: 'add_4B',
                          vcl.float64: 'add_8B'}
    additional_parameters_dict = {'N': "char_to_type<'N'>",
                                  'T': "char_to_type<'T'>"}

    # Create the device-specific headers
    cpp_device_name = sanitize_string(device.name)
    function_name = function_name_dict[datatype]
    operation = operation.replace('-', '_')
    cpp_class_name = operation + '_template'
    header_name = cpp_device_name + ".hpp"
    declaration_arguments = ['database_type<' + cpp_class_name + '::parameters_type> & db']
    declaration_arguments += [additional_parameters_dict[x] for x in additional_parameters]
    function_declaration = 'inline void ' + function_name + '(' + ', '.join(declaration_arguments) + ')'

    device_type_prefix = {
        cl.device_type.GPU: 'gpu',
        cl.device_type.CPU: 'cpu',
        cl.device_type.ACCELERATOR: 'accelerator'
    }[device.type]
    vendor_prefix = {
        vcl.opencl.VendorId.beignet_id: 'beignet',
        vcl.opencl.VendorId.nvidia_id: 'nvidia',
        vcl.opencl.VendorId.amd_id: 'amd',
        vcl.opencl.VendorId.intel_id: 'intel'
    }[device.vendor_id]
    architecture_family = vcl.opencl.architecture_family(device.vendor_id, device.name)

    header_hierarchy = ["devices", device_type_prefix, vendor_prefix, architecture_family]
    header_directory = os.path.join(builtin_database_dir, *header_hierarchy)
    header_path = os.path.join(header_directory, header_name)
    if not os.path.exists(header_directory):
        os.makedirs(header_directory)

    # Load the existing header, if any
    data = ''
    if os.path.exists(header_path):
        with open(header_path, "r") as myfile:
            data = myfile.read()

    if not data:
        # Fresh header: include guard, common includes and nested namespaces
        ifndef_suffix = ('_'.join(header_hierarchy) + '_hpp_').upper()
        data = ''.join([
            '#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_' + ifndef_suffix + '\n',
            '#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_' + ifndef_suffix + '\n',
            '\n',
            '#include "viennacl/device_specific/forwards.h"\n',
            '#include "viennacl/device_specific/builtin_database/common.hpp"\n',
            '\n',
            'namespace viennacl{\n',
            'namespace device_specific{\n',
            'namespace builtin_database{\n',
            'namespace devices{\n',
            'namespace ' + device_type_prefix + '{\n',
            'namespace ' + vendor_prefix + '{\n',
            'namespace ' + architecture_family + '{\n',
            'namespace ' + cpp_device_name + '{\n',
            '\n',
            '}\n',
            '}\n',
            '}\n',
            '}\n',
            '}\n',
            '}\n',
            '}\n',
            '}\n',
            '#endif\n',
        ])

    data = append_include(data, 'viennacl/device_specific/templates/' + cpp_class_name + '.hpp')

    device_type = {
        cl.device_type.GPU: 'CL_DEVICE_TYPE_GPU',
        cl.device_type.CPU: 'CL_DEVICE_TYPE_CPU',
        cl.device_type.ACCELERATOR: 'CL_DEVICE_TYPE_ACCELERATOR'
    }[device.type]
    add_to_database_arguments = [vendor_prefix + '_id', device_type, 'ocl::'+architecture_family,
                                 '"' + device.name + '"', cpp_class_name + '::parameters_type(' + ','.join(map(str, parameters)) + ')']
    core = ' db.' + function_name + '(' + ', '.join(add_to_database_arguments) + ');'

    already_declared = data.find(function_declaration)
    if already_declared == -1:
        # New function: insert it right after the device namespace opens
        substr = 'namespace ' + cpp_device_name + '{\n'
        insert_index = data.index(substr) + len(substr)
        data = data[:insert_index] + '\n' + function_declaration + '\n{\n' + core + '\n}\n' + data[insert_index:]
    else:
        # Existing function: replace everything between its braces
        i1 = data.find('{', already_declared)
        if data[i1-1] == '\n':
            i1 = i1 - 1
        i2 = data.find('}', already_declared) + 1
        data = data[:i1] + '\n{\n' + core + '\n}' + data[i2:]

    # Write the header file
    with open(header_path, "w+") as myfile:
        myfile.write(data)

    # Updates the global ViennaCL headers
    with open(os.path.join(builtin_database_dir, operation + '.hpp'), 'r+') as operation_header:
        data = operation_header.read()
        data = append_include(data, os.path.relpath(header_path, os.path.join(viennacl_root, os.pardir)))
        scope_name = '_'.join(('init', operation) + additional_parameters)
        scope = data.index(scope_name)
        call_arguments = ['result'] + [additional_parameters_dict[k] + '()' for k in additional_parameters]
        function_call = ' ' + '::'.join(header_hierarchy + [cpp_device_name, function_name]) + '(' + ', '.join(call_arguments) + ')'
        if function_call not in data:
            insert_index = data.rindex('\n', 0, data.index('return result', scope))
            data = data[:insert_index] + function_call + ';\n' + data[insert_index:]
        operation_header.seek(0)
        operation_header.truncate()
        operation_header.write(data)