From fc8b450a7c498861d5d940311a82cdd57ad3c58d Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 4 Oct 2014 08:58:11 +0200 Subject: [PATCH] Input-dependent models now activated for all the operations --- autotune/config.ini | 12 ++--- autotune/python/autotune.py | 42 +++++++++------ autotune/python/dataset.py | 102 ++++++++++++++++-------------------- autotune/python/genetic.py | 15 +++--- autotune/python/model.py | 8 +-- 5 files changed, 89 insertions(+), 90 deletions(-) diff --git a/autotune/config.ini b/autotune/config.ini index 042ce1500..e07a8a924 100644 --- a/autotune/config.ini +++ b/autotune/config.ini @@ -4,16 +4,16 @@ tmp-folder = /tmp/ [vector-axpy] devices = 0 precision = single -size = 10000000 - +#~ size = 10000000 +#~ #~ [matrix-axpy] #~ devices = 0 -#~ precision = all +#~ precision = single #~ size = 3072, 3072 - +#~ #~ [row-wise-reduction] #~ devices = 0 -#~ precision = all +#~ precision = single #~ layout = N, T #~ size = 3968, 3968 @@ -21,4 +21,4 @@ size = 10000000 devices = 0 precision = single layout = NT -size = 1536, 1536, 1536 +#size = 1536, 1536, 1536 diff --git a/autotune/python/autotune.py b/autotune/python/autotune.py index 5478b3cfe..7a06c1ca7 100644 --- a/autotune/python/autotune.py +++ b/autotune/python/autotune.py @@ -8,6 +8,7 @@ from external.configobj import ConfigObj import pyopencl as cl import pyviennacl as vcl +import numpy as np from pyviennacl import backend from pyviennacl import opencl from pyviennacl import atidlas @@ -73,32 +74,45 @@ def do_tuning(config_fname, spec_fname, viennacl_root): with open(fname, "w+") as archive: return optimize.genetic(statement, device, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params), lambda t: TYPES[operation]['perf-index']([datatype().itemsize, sizes, t]), TYPES[operation]['perf-measure'], archive) + #Helper + def tune(execution_handler, nTuning, nDataPoints, draw): + if 'size' in p: + profile = execution_handler(map_to_list(int, p['size'])) + else: + def compute_perf(x, t): + return TYPES[operation]['perf-index']([datatype().itemsize, x, t]) + X, Y, profiles = generate_dataset(TYPES[operation]['template'], execution_handler, nTuning, nDataPoints, compute_perf, draw) + train_model(X, Y, profiles, TYPES[operation]['perf-measure']) + #Vector AXPY if operation=='vector-axpy': def execution_handler(sizes, fname=os.devnull, parameters=None): x = vcl.Vector(sizes[0], context=ctx, dtype=datatype) y = vcl.Vector(sizes[0], context=ctx, dtype=datatype) return execute(device, vcl.Statement(vcl.ElementProd(vcl.exp(x + y),vcl.cos(x + y))), (), sizes, fname, parameters) - if 'size' in p: - profile = execution_handler(map_to_list(int, p['size'])) + tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=10, high=100000, size=1)) #Matrix AXPY if operation=='matrix-axpy': - A = vcl.Matrix(s, context=ctx, dtype=datatype) - B = vcl.Matrix(s, context=ctx, dtype=datatype) - execute(A+B, ()) + def execution_handler(sizes, fname=os.devnull, parameters=None): + A = vcl.Matrix(sizes, context=ctx, dtype=datatype) + B = vcl.Matrix(sizes, context=ctx, dtype=datatype) + return execute(device, vcl.Statement(A+B), (), sizes, fname, parameters) + tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=5, high=100, size=2)) #Row-wise reduction if operation=='row-wise-reduction': - layouts = map_to_list((str,p['layout'])) + layouts = map_to_list(str,p['layout']) if 'all' in layouts: layouts = ['N', 'T'] for A_trans in layouts: - A = vcl.Matrix(s if A_trans=='N' else s[::-1], context=ctx, dtype=datatype, layout=vcl.COL_MAJOR) - x = vcl.Vector(s[1] if A_trans=='N' else s[0], context=ctx, dtype=datatype) - LHS = A if A_trans=='N' else A.T - execute(LHS*x, ()) + def execution_handler(sizes, fname=os.devnull, parameters=None): + A = vcl.Matrix(sizes if A_trans=='N' else sizes[::-1], context=ctx, dtype=datatype, layout=vcl.COL_MAJOR) + x = vcl.Vector(sizes[1] if A_trans=='N' else sizes[0], context=ctx, dtype=datatype) + LHS = A if A_trans=='N' else A.T + execute(device, vcl.Statement(LHS*x), (), sizes, fname, parameters) + tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=5, high=100, size=2)) #Matrix Product if operation=='matrix-product': - layouts = map_to_list((str,p['layout'])) + layouts = map_to_list(str,p['layout']) if 'all' in layouts: layouts = ['NN', 'NT', 'TN', 'TT'] for layout in layouts: @@ -114,11 +128,7 @@ def do_tuning(config_fname, spec_fname, viennacl_root): C = vcl.Matrix((sizes[0], sizes[2]), context=ctx, dtype = datatype, layout=vcl.COL_MAJOR) statement = vcl.Statement(vcl.Assign(C,LHS*RHS*alpha + C*beta)) return execute(device, statement,(A_trans, B_trans), sizes, fname, parameters) - if 'size' in p: - profile = execution_handler(map(int, p['size'])) - else: - X, Y, profiles = generate_dataset(TYPES[operation]['template'], execution_handler) - train_model(X, Y, profiles) + tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=1, high=40, size=3)) diff --git a/autotune/python/dataset.py b/autotune/python/dataset.py index 326f34015..d0664c0ce 100644 --- a/autotune/python/dataset.py +++ b/autotune/python/dataset.py @@ -6,71 +6,59 @@ import numpy as np from sklearn.neighbors.kde import KernelDensity from pyviennacl.atidlas import FetchingPolicy -def decode(y): - fetch = [FetchingPolicy.FETCH_FROM_LOCAL, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS, FetchingPolicy.FETCH_FROM_GLOBAL_STRIDED] - y[7] = fetch[y[7]] - y[8] = fetch[y[8]] - return y - -def resample(X, tbincount, densities, step): +def resample(X, draw): Xtuples = [tuple(x) for x in X] r = random.random() while(True): - if(len(tbincount)==0 or len(densities)==0 or r<=1.0/len(densities)): - x = np.array([step*random.randint(1,40), step*random.randint(1,40), step*random.randint(1,40)]) - else: - probs = [1.0/x if x>0 else 0 for x in tbincount] - distr = np.random.choice(range(tbincount.size), p = probs/np.sum(probs)) - x = densities[distr].sample()[0] - x = np.maximum(np.ones(x.shape),(x - step/2).astype(int)/step + 1)*step + x = draw() if tuple(x) not in Xtuples: break return x.astype(int) -def generate_dataset(TemplateType, execution_handler): - I = 50 - step = 64 - path = "./data" +def generate_dataset(TemplateType, execution_handler, nTuning, nDataPoints, compute_perf, draw): - # print "Getting some good profiles..." - # X = np.empty((I, 3)) - # t = np.empty(I) - # profiles = [] - # for i in range(I): - # x = resample(X, [], [], step) - # y = execution_handler(x) - # if y not in profiles: - # profiles.append(y) - # idx = profiles.index(y) - # X[i,:] = x - # t[i] = idx - # densities = [KernelDensity(kernel='gaussian', bandwidth=2*step).fit(X[t==i,:]) for i in range(int(max(t))+1)]; - # - # print "Generating the dataset..." - # N = 10000 - # Y = np.empty((N, len(profiles))) - # X = np.empty((N,3)) - # t = [] - # - # for i in range(N): - # x = resample(X, [], [], step) - # for j,y in enumerate(profiles): - # T = execution_handler(x, os.devnull, decode(map(int, y))) - # Y[i,j] = 2*1e-9*x[0]*x[1]*x[2]/T - # idx = np.argmax(Y[i,:]) - # X[i,:] = x - # t = np.argmax(Y[:i+1,], axis=1) - # densities[idx].fit(X[t==idx,:]) - # if i%10==0: - # sys.stdout.write('%d data points generated\r'%i) - # sys.stdout.flush() - # - # np.savetxt(os.path.join(path,"profiles.csv"), profiles) - # np.savetxt(os.path.join(path,"X.csv"), X) - # np.savetxt(os.path.join(path,"Y.csv"), Y) + print "Getting some good profiles..." + nDim = draw().size + X = np.empty((nTuning, nDim)) + t = np.empty(nTuning) + profiles = [] + for i in range(nTuning): + x = resample(X, draw) + y = execution_handler(x) + if y not in profiles: + profiles.append(y) + idx = profiles.index(y) + X[i,:] = x + t[i] = idx - profiles = np.loadtxt(os.path.join(path,"profiles.csv")) - X = np.loadtxt(os.path.join(path,"X.csv")) - Y = np.loadtxt(os.path.join(path,"Y.csv")) + print "Generating the dataset..." + Y = np.empty((nDataPoints, len(profiles))) + X = np.empty((nDataPoints, nDim)) + t = [] + + for i in range(nDataPoints): + x = resample(X, draw) + for j,y in enumerate(profiles): + T = execution_handler(x, os.devnull, y) + Y[i,j] = compute_perf(x, T) + idx = np.argmax(Y[i,:]) + X[i,:] = x + t = np.argmax(Y[:i+1,], axis=1) + if i%10==0: + sys.stdout.write('%d data points generated\r'%i) + sys.stdout.flush() + + template_name = TemplateType.__name__ + dir = os.path.join("data", template_name) + if not os.path.exists(dir): + os.makedirs(dir) + + np.savetxt(os.path.join(dir,"profiles.csv"), profiles) + np.savetxt(os.path.join(dir,"X.csv"), X) + np.savetxt(os.path.join(dir,"Y.csv"), Y) + + profiles = np.loadtxt(os.path.join(dir, "profiles.csv")) + X = np.loadtxt(os.path.join(dir, "X.csv"),ndmin=2) + Y = np.loadtxt(os.path.join(dir, "Y.csv"),ndmin=2) return X, Y, profiles diff --git a/autotune/python/genetic.py b/autotune/python/genetic.py index 224b997b3..3fd0d83ed 100644 --- a/autotune/python/genetic.py +++ b/autotune/python/genetic.py @@ -40,13 +40,15 @@ class GeneticOperators(object): self.ParameterType = TemplateType.Parameters self.build_template = build_template self.cache = {} - self.indpb = 0.05 self.out = out self.genome_info = { vcl.atidlas.VectorAxpyTemplate: [3,4,4,vcl.atidlas.FetchingPolicy], + vcl.atidlas.MatrixAxpyTemplate: [3,3,3,3,3,vcl.atidlas.FetchingPolicy], + vcl.atidlas.RowWiseReductionTemplate: [3,3,3,4,vcl.atidlas.FetchingPolicy], vcl.atidlas.MatrixProductTemplate: [3,3,3,3,3,3,3,vcl.atidlas.FetchingPolicy,vcl.atidlas.FetchingPolicy,3] }[TemplateType] + self.indpb = 1.0/sum([1 if x==vcl.atidlas.FetchingPolicy else x for x in self.genome_info]) creator.create("FitnessMin", base.Fitness, weights=(-1.0,)) creator.create("Individual", list, fitness=creator.FitnessMin) @@ -149,7 +151,7 @@ class GeneticOperators(object): ind.fitness.values = fit hof.update(population) - while time.time() - start_time < maxtime: + while time.time() - start_time < maxtime and gen < maxgen: # Vary the population offspring = [] for _ in xrange(mu): @@ -166,9 +168,8 @@ class GeneticOperators(object): offspring.append(ind) else: # Apply reproduction offspring.append(random.choice(population)) - - #~ for x in offspring: - #~ print self.decode(x) + #for x in offspring: + #print self.decode(x) # Evaluate the individuals with an invalid fitness invalid_ind = [ind for ind in offspring if not ind.fitness.valid] fitnesses = self.toolbox.map(self.evaluate, invalid_ind) @@ -180,9 +181,9 @@ class GeneticOperators(object): population[:] = self.toolbox.select(population + offspring, mu) #Update gen = gen + 1 - best_profile = '(%s)'%','.join(map(str,self.decode(hof[0]))); + best_profile = '(%s)'%','.join(map(str,self.decode(hof[0]))) best_performance = compute_perf(hof[0].fitness.values[0]) - sys.stdout.write('Time %d | Best %d %s [ for %s ]\r'%(time.time() - start_time, best_performance, perf_metric, best_profile)) + sys.stdout.write('Generation %d | Time %d | Best %d %s [ for %s ]\r'%(gen, time.time() - start_time, best_performance, perf_metric, best_profile)) sys.stdout.flush() sys.stdout.write('\n') return self.decode(hof[0]) diff --git a/autotune/python/model.py b/autotune/python/model.py index 87e6a09a5..62c3583f5 100644 --- a/autotune/python/model.py +++ b/autotune/python/model.py @@ -8,7 +8,7 @@ from pybrain.supervised.trainers import BackpropTrainer from pybrain.structure import LinearLayer, TanhLayer, SigmoidLayer, SoftmaxLayer, FeedForwardNetwork, BiasUnit from pybrain.tools.neuralnets import NNregression, Trainer -def train_model(X, Y, profiles): +def train_model(X, Y, profiles, metric): #Preprocessing Xmean = np.mean(X, axis=0) Xstd = np.std(X, axis=0) @@ -43,7 +43,7 @@ def train_model(X, Y, profiles): np.set_printoptions(precision=2) print("-----------------") print("Average testing speedup : %f (Optimal : %f)"%(sp.stats.gmean(speedups), sp.stats.gmean(optspeedups))) - print("Average GFLOP/s : %f (Default %f, Optimal %f)"%(np.mean(np.multiply(GFlops,speedups)), np.mean(GFlops), np.mean(np.multiply(GFlops,optspeedups)))) - print("Minimum speedup is %f wrt %i GFlops"%(np.min(speedups), GFlops[np.argmin(speedups)])) - print("Maximum speedup is %f wrt %i GFlops for %s"%(np.max(speedups), GFlops[np.argmax(speedups)], X[np.argmax(speedups)]*Xstd+Xmean)) + print("Average %s: %f (Default %f, Optimal %f)"%(metric, np.mean(np.multiply(GFlops,speedups)), np.mean(GFlops), np.mean(np.multiply(GFlops,optspeedups)))) + print("Minimum speedup is %f wrt %i %s"%(np.min(speedups), GFlops[np.argmin(speedups)], metric)) + print("Maximum speedup is %f wrt %i %s"%(np.max(speedups), GFlops[np.argmax(speedups)], metric)) print("--------") \ No newline at end of file