Input-dependent models now activated for all the operations

This commit is contained in:
Philippe Tillet
2014-10-04 08:58:11 +02:00
parent 044419f9f0
commit fc8b450a7c
5 changed files with 89 additions and 90 deletions

View File

@@ -4,16 +4,16 @@ tmp-folder = /tmp/
[vector-axpy] [vector-axpy]
devices = 0 devices = 0
precision = single precision = single
size = 10000000 #~ size = 10000000
#~
#~ [matrix-axpy] #~ [matrix-axpy]
#~ devices = 0 #~ devices = 0
#~ precision = all #~ precision = single
#~ size = 3072, 3072 #~ size = 3072, 3072
#~
#~ [row-wise-reduction] #~ [row-wise-reduction]
#~ devices = 0 #~ devices = 0
#~ precision = all #~ precision = single
#~ layout = N, T #~ layout = N, T
#~ size = 3968, 3968 #~ size = 3968, 3968
@@ -21,4 +21,4 @@ size = 10000000
devices = 0 devices = 0
precision = single precision = single
layout = NT layout = NT
size = 1536, 1536, 1536 #size = 1536, 1536, 1536

View File

@@ -8,6 +8,7 @@ from external.configobj import ConfigObj
import pyopencl as cl import pyopencl as cl
import pyviennacl as vcl import pyviennacl as vcl
import numpy as np
from pyviennacl import backend from pyviennacl import backend
from pyviennacl import opencl from pyviennacl import opencl
from pyviennacl import atidlas from pyviennacl import atidlas
@@ -73,32 +74,45 @@ def do_tuning(config_fname, spec_fname, viennacl_root):
with open(fname, "w+") as archive: with open(fname, "w+") as archive:
return optimize.genetic(statement, device, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params), return optimize.genetic(statement, device, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params),
lambda t: TYPES[operation]['perf-index']([datatype().itemsize, sizes, t]), TYPES[operation]['perf-measure'], archive) lambda t: TYPES[operation]['perf-index']([datatype().itemsize, sizes, t]), TYPES[operation]['perf-measure'], archive)
#Helper
# tune() is shared by the per-operation branches below: each branch builds an
# execution_handler and passes it here together with its sampling settings.
# NOTE(review): indentation is flattened in this view; `p`, `operation`,
# `datatype` and `TYPES` are not parameters, so they presumably come from the
# enclosing do_tuning scope -- confirm against the real file.
#   execution_handler: callable invoked here with the list of int sizes from p['size'].
#   nTuning, nDataPoints, draw: forwarded unchanged to generate_dataset.
def tune(execution_handler, nTuning, nDataPoints, draw):
# A 'size' entry in p selects the fixed-size path: run the handler once on it.
if 'size' in p:
# NOTE(review): `profile` is assigned but not used again inside this helper.
profile = execution_handler(map_to_list(int, p['size']))
else:
# No fixed size given: generate a dataset and train a model on it.
# compute_perf passes [datatype().itemsize, x, t] to the operation's 'perf-index' entry.
def compute_perf(x, t):
return TYPES[operation]['perf-index']([datatype().itemsize, x, t])
X, Y, profiles = generate_dataset(TYPES[operation]['template'], execution_handler, nTuning, nDataPoints, compute_perf, draw)
# The operation's 'perf-measure' entry is handed to train_model as its last argument.
train_model(X, Y, profiles, TYPES[operation]['perf-measure'])
#Vector AXPY #Vector AXPY
if operation=='vector-axpy': if operation=='vector-axpy':
def execution_handler(sizes, fname=os.devnull, parameters=None): def execution_handler(sizes, fname=os.devnull, parameters=None):
x = vcl.Vector(sizes[0], context=ctx, dtype=datatype) x = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
y = vcl.Vector(sizes[0], context=ctx, dtype=datatype) y = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
return execute(device, vcl.Statement(vcl.ElementProd(vcl.exp(x + y),vcl.cos(x + y))), (), sizes, fname, parameters) return execute(device, vcl.Statement(vcl.ElementProd(vcl.exp(x + y),vcl.cos(x + y))), (), sizes, fname, parameters)
if 'size' in p: tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=10, high=100000, size=1))
profile = execution_handler(map_to_list(int, p['size']))
#Matrix AXPY #Matrix AXPY
if operation=='matrix-axpy': if operation=='matrix-axpy':
A = vcl.Matrix(s, context=ctx, dtype=datatype) def execution_handler(sizes, fname=os.devnull, parameters=None):
B = vcl.Matrix(s, context=ctx, dtype=datatype) A = vcl.Matrix(sizes, context=ctx, dtype=datatype)
execute(A+B, ()) B = vcl.Matrix(sizes, context=ctx, dtype=datatype)
return execute(device, vcl.Statement(A+B), (), sizes, fname, parameters)
tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=5, high=100, size=2))
#Row-wise reduction #Row-wise reduction
if operation=='row-wise-reduction': if operation=='row-wise-reduction':
layouts = map_to_list((str,p['layout'])) layouts = map_to_list(str,p['layout'])
if 'all' in layouts: if 'all' in layouts:
layouts = ['N', 'T'] layouts = ['N', 'T']
for A_trans in layouts: for A_trans in layouts:
A = vcl.Matrix(s if A_trans=='N' else s[::-1], context=ctx, dtype=datatype, layout=vcl.COL_MAJOR) def execution_handler(sizes, fname=os.devnull, parameters=None):
x = vcl.Vector(s[1] if A_trans=='N' else s[0], context=ctx, dtype=datatype) A = vcl.Matrix(sizes if A_trans=='N' else sizes[::-1], context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
LHS = A if A_trans=='N' else A.T x = vcl.Vector(sizes[1] if A_trans=='N' else sizes[0], context=ctx, dtype=datatype)
execute(LHS*x, ()) LHS = A if A_trans=='N' else A.T
execute(device, vcl.Statement(LHS*x), (), sizes, fname, parameters)
tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=5, high=100, size=2))
#Matrix Product #Matrix Product
if operation=='matrix-product': if operation=='matrix-product':
layouts = map_to_list((str,p['layout'])) layouts = map_to_list(str,p['layout'])
if 'all' in layouts: if 'all' in layouts:
layouts = ['NN', 'NT', 'TN', 'TT'] layouts = ['NN', 'NT', 'TN', 'TT']
for layout in layouts: for layout in layouts:
@@ -114,11 +128,7 @@ def do_tuning(config_fname, spec_fname, viennacl_root):
C = vcl.Matrix((sizes[0], sizes[2]), context=ctx, dtype = datatype, layout=vcl.COL_MAJOR) C = vcl.Matrix((sizes[0], sizes[2]), context=ctx, dtype = datatype, layout=vcl.COL_MAJOR)
statement = vcl.Statement(vcl.Assign(C,LHS*RHS*alpha + C*beta)) statement = vcl.Statement(vcl.Assign(C,LHS*RHS*alpha + C*beta))
return execute(device, statement,(A_trans, B_trans), sizes, fname, parameters) return execute(device, statement,(A_trans, B_trans), sizes, fname, parameters)
if 'size' in p: tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=1, high=40, size=3))
profile = execution_handler(map(int, p['size']))
else:
X, Y, profiles = generate_dataset(TYPES[operation]['template'], execution_handler)
train_model(X, Y, profiles)

View File

@@ -6,71 +6,59 @@ import numpy as np
from sklearn.neighbors.kde import KernelDensity from sklearn.neighbors.kde import KernelDensity
from pyviennacl.atidlas import FetchingPolicy from pyviennacl.atidlas import FetchingPolicy
def decode(y): def resample(X, draw):
fetch = [FetchingPolicy.FETCH_FROM_LOCAL, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS, FetchingPolicy.FETCH_FROM_GLOBAL_STRIDED]
y[7] = fetch[y[7]]
y[8] = fetch[y[8]]
return y
def resample(X, tbincount, densities, step):
Xtuples = [tuple(x) for x in X] Xtuples = [tuple(x) for x in X]
r = random.random() r = random.random()
while(True): while(True):
if(len(tbincount)==0 or len(densities)==0 or r<=1.0/len(densities)): x = draw()
x = np.array([step*random.randint(1,40), step*random.randint(1,40), step*random.randint(1,40)])
else:
probs = [1.0/x if x>0 else 0 for x in tbincount]
distr = np.random.choice(range(tbincount.size), p = probs/np.sum(probs))
x = densities[distr].sample()[0]
x = np.maximum(np.ones(x.shape),(x - step/2).astype(int)/step + 1)*step
if tuple(x) not in Xtuples: if tuple(x) not in Xtuples:
break break
return x.astype(int) return x.astype(int)
def generate_dataset(TemplateType, execution_handler): def generate_dataset(TemplateType, execution_handler, nTuning, nDataPoints, compute_perf, draw):
I = 50
step = 64
path = "./data"
# print "Getting some good profiles..." print "Getting some good profiles..."
# X = np.empty((I, 3)) nDim = draw().size
# t = np.empty(I) X = np.empty((nTuning, nDim))
# profiles = [] t = np.empty(nTuning)
# for i in range(I): profiles = []
# x = resample(X, [], [], step) for i in range(nTuning):
# y = execution_handler(x) x = resample(X, draw)
# if y not in profiles: y = execution_handler(x)
# profiles.append(y) if y not in profiles:
# idx = profiles.index(y) profiles.append(y)
# X[i,:] = x idx = profiles.index(y)
# t[i] = idx X[i,:] = x
# densities = [KernelDensity(kernel='gaussian', bandwidth=2*step).fit(X[t==i,:]) for i in range(int(max(t))+1)]; t[i] = idx
#
# print "Generating the dataset..."
# N = 10000
# Y = np.empty((N, len(profiles)))
# X = np.empty((N,3))
# t = []
#
# for i in range(N):
# x = resample(X, [], [], step)
# for j,y in enumerate(profiles):
# T = execution_handler(x, os.devnull, decode(map(int, y)))
# Y[i,j] = 2*1e-9*x[0]*x[1]*x[2]/T
# idx = np.argmax(Y[i,:])
# X[i,:] = x
# t = np.argmax(Y[:i+1,], axis=1)
# densities[idx].fit(X[t==idx,:])
# if i%10==0:
# sys.stdout.write('%d data points generated\r'%i)
# sys.stdout.flush()
#
# np.savetxt(os.path.join(path,"profiles.csv"), profiles)
# np.savetxt(os.path.join(path,"X.csv"), X)
# np.savetxt(os.path.join(path,"Y.csv"), Y)
profiles = np.loadtxt(os.path.join(path,"profiles.csv")) print "Generating the dataset..."
X = np.loadtxt(os.path.join(path,"X.csv")) Y = np.empty((nDataPoints, len(profiles)))
Y = np.loadtxt(os.path.join(path,"Y.csv")) X = np.empty((nDataPoints, nDim))
t = []
for i in range(nDataPoints):
x = resample(X, draw)
for j,y in enumerate(profiles):
T = execution_handler(x, os.devnull, y)
Y[i,j] = compute_perf(x, T)
idx = np.argmax(Y[i,:])
X[i,:] = x
t = np.argmax(Y[:i+1,], axis=1)
if i%10==0:
sys.stdout.write('%d data points generated\r'%i)
sys.stdout.flush()
template_name = TemplateType.__name__
dir = os.path.join("data", template_name)
if not os.path.exists(dir):
os.makedirs(dir)
np.savetxt(os.path.join(dir,"profiles.csv"), profiles)
np.savetxt(os.path.join(dir,"X.csv"), X)
np.savetxt(os.path.join(dir,"Y.csv"), Y)
profiles = np.loadtxt(os.path.join(dir, "profiles.csv"))
X = np.loadtxt(os.path.join(dir, "X.csv"),ndmin=2)
Y = np.loadtxt(os.path.join(dir, "Y.csv"),ndmin=2)
return X, Y, profiles return X, Y, profiles

View File

@@ -40,13 +40,15 @@ class GeneticOperators(object):
self.ParameterType = TemplateType.Parameters self.ParameterType = TemplateType.Parameters
self.build_template = build_template self.build_template = build_template
self.cache = {} self.cache = {}
self.indpb = 0.05
self.out = out self.out = out
self.genome_info = { self.genome_info = {
vcl.atidlas.VectorAxpyTemplate: [3,4,4,vcl.atidlas.FetchingPolicy], vcl.atidlas.VectorAxpyTemplate: [3,4,4,vcl.atidlas.FetchingPolicy],
vcl.atidlas.MatrixAxpyTemplate: [3,3,3,3,3,vcl.atidlas.FetchingPolicy],
vcl.atidlas.RowWiseReductionTemplate: [3,3,3,4,vcl.atidlas.FetchingPolicy],
vcl.atidlas.MatrixProductTemplate: [3,3,3,3,3,3,3,vcl.atidlas.FetchingPolicy,vcl.atidlas.FetchingPolicy,3] vcl.atidlas.MatrixProductTemplate: [3,3,3,3,3,3,3,vcl.atidlas.FetchingPolicy,vcl.atidlas.FetchingPolicy,3]
}[TemplateType] }[TemplateType]
self.indpb = 1.0/sum([1 if x==vcl.atidlas.FetchingPolicy else x for x in self.genome_info])
creator.create("FitnessMin", base.Fitness, weights=(-1.0,)) creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin) creator.create("Individual", list, fitness=creator.FitnessMin)
@@ -149,7 +151,7 @@ class GeneticOperators(object):
ind.fitness.values = fit ind.fitness.values = fit
hof.update(population) hof.update(population)
while time.time() - start_time < maxtime: while time.time() - start_time < maxtime and gen < maxgen:
# Vary the population # Vary the population
offspring = [] offspring = []
for _ in xrange(mu): for _ in xrange(mu):
@@ -166,9 +168,8 @@ class GeneticOperators(object):
offspring.append(ind) offspring.append(ind)
else: # Apply reproduction else: # Apply reproduction
offspring.append(random.choice(population)) offspring.append(random.choice(population))
#for x in offspring:
#~ for x in offspring: #print self.decode(x)
#~ print self.decode(x)
# Evaluate the individuals with an invalid fitness # Evaluate the individuals with an invalid fitness
invalid_ind = [ind for ind in offspring if not ind.fitness.valid] invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
fitnesses = self.toolbox.map(self.evaluate, invalid_ind) fitnesses = self.toolbox.map(self.evaluate, invalid_ind)
@@ -180,9 +181,9 @@ class GeneticOperators(object):
population[:] = self.toolbox.select(population + offspring, mu) population[:] = self.toolbox.select(population + offspring, mu)
#Update #Update
gen = gen + 1 gen = gen + 1
best_profile = '(%s)'%','.join(map(str,self.decode(hof[0]))); best_profile = '(%s)'%','.join(map(str,self.decode(hof[0])))
best_performance = compute_perf(hof[0].fitness.values[0]) best_performance = compute_perf(hof[0].fitness.values[0])
sys.stdout.write('Time %d | Best %d %s [ for %s ]\r'%(time.time() - start_time, best_performance, perf_metric, best_profile)) sys.stdout.write('Generation %d | Time %d | Best %d %s [ for %s ]\r'%(gen, time.time() - start_time, best_performance, perf_metric, best_profile))
sys.stdout.flush() sys.stdout.flush()
sys.stdout.write('\n') sys.stdout.write('\n')
return self.decode(hof[0]) return self.decode(hof[0])

View File

@@ -8,7 +8,7 @@ from pybrain.supervised.trainers import BackpropTrainer
from pybrain.structure import LinearLayer, TanhLayer, SigmoidLayer, SoftmaxLayer, FeedForwardNetwork, BiasUnit from pybrain.structure import LinearLayer, TanhLayer, SigmoidLayer, SoftmaxLayer, FeedForwardNetwork, BiasUnit
from pybrain.tools.neuralnets import NNregression, Trainer from pybrain.tools.neuralnets import NNregression, Trainer
def train_model(X, Y, profiles): def train_model(X, Y, profiles, metric):
#Preprocessing #Preprocessing
Xmean = np.mean(X, axis=0) Xmean = np.mean(X, axis=0)
Xstd = np.std(X, axis=0) Xstd = np.std(X, axis=0)
@@ -43,7 +43,7 @@ def train_model(X, Y, profiles):
np.set_printoptions(precision=2) np.set_printoptions(precision=2)
print("-----------------") print("-----------------")
print("Average testing speedup : %f (Optimal : %f)"%(sp.stats.gmean(speedups), sp.stats.gmean(optspeedups))) print("Average testing speedup : %f (Optimal : %f)"%(sp.stats.gmean(speedups), sp.stats.gmean(optspeedups)))
print("Average GFLOP/s : %f (Default %f, Optimal %f)"%(np.mean(np.multiply(GFlops,speedups)), np.mean(GFlops), np.mean(np.multiply(GFlops,optspeedups)))) print("Average %s: %f (Default %f, Optimal %f)"%(metric, np.mean(np.multiply(GFlops,speedups)), np.mean(GFlops), np.mean(np.multiply(GFlops,optspeedups))))
print("Minimum speedup is %f wrt %i GFlops"%(np.min(speedups), GFlops[np.argmin(speedups)])) print("Minimum speedup is %f wrt %i %s"%(np.min(speedups), GFlops[np.argmin(speedups)], metric))
print("Maximum speedup is %f wrt %i GFlops for %s"%(np.max(speedups), GFlops[np.argmax(speedups)], X[np.argmax(speedups)]*Xstd+Xmean)) print("Maximum speedup is %f wrt %i %s"%(np.max(speedups), GFlops[np.argmax(speedups)], metric))
print("--------") print("--------")