Porting the genetic algorithm (GA) to all the operations

2014-10-03 09:29:45 +02:00
parent 2f6d41f661
commit 044419f9f0
6 changed files with 76 additions and 78 deletions
--- a/autotune/config.ini
+++ b/autotune/config.ini
@@ -1,10 +1,10 @@
 #will save the archive into /tmp/name-of-operation.dat
 tmp-folder = /tmp/

-#~ [vector-axpy]
-#~ devices = 0
-#~ precision = all
-#~ size = 10000000
+[vector-axpy]
+devices = 0
+precision = single
+size = 10000000

 #~ [matrix-axpy]
 #~ devices = 0
--- a/autotune/python/autotune.py
+++ b/autotune/python/autotune.py
@@ -24,39 +24,35 @@ DATATYPES = { 'single' : vcl.float32,
              'double' : vcl.float64 }

 TYPES = { 'vector-axpy': {'template':vcl.atidlas.VectorAxpyTemplate,
-                          'parameter-names':['simd-width', 'local-size-0', 'num-groups-0', 'fetch'],
                          'perf-index':lambda x: 3*x[0]*x[1][0]/x[2]*1e-9,
                          'perf-measure':'GB/s'},

          'matrix-axpy': {'template':vcl.atidlas.MatrixAxpyTemplate,
-                          'parameter-names':['simd-width', 'local-size-0', 'local-size-1', 'num-groups-0', 'num-groups-1', 'fetch'],
                          'perf-index':lambda x: 3*x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
                          'perf-measure':'GB/s'},

          'reduction': {'template':vcl.atidlas.ReductionTemplate,
-                        'parameter-names':['simd-width', 'local-size-0', 'num-groups-0', 'fetch'],
                        'perf-index':lambda x: 2*x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
                        'perf-measure':'GB/s'},

          'row-wise-reduction': {'template':vcl.atidlas.RowWiseReductionTemplate,
-                                'parameter-names':['simd-width', 'local-size-0', 'local-size-1', 'num-groups-0', 'fetch'],
                                'perf-index':lambda x: x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
                                'perf-measure':'GB/s'},

          'matrix-product': {'template':vcl.atidlas.MatrixProductTemplate,
-                            'parameter-names':['simd-width', 'local-size-0', 'kL', 'local-size-1', 'mS', 'kS', 'nS', 'A-fetch-policy', 'B-fetch-policy', 'local-fetch-size-0', 'local-fetch-size-1'],
                            'perf-index': lambda x: 2*x[1][0]*x[1][1]*x[1][2]/x[2]*1e-9,
                            'perf-measure': 'GFLOP/s'} }

 def do_tuning(config_fname, spec_fname, viennacl_root):
    config = ConfigObj(config_fname, configspec=spec_fname)
-    map_to_list = lambda T: list(map(T[0], T[1] if isinstance(T[1], list) else [T[1]]))
+    def map_to_list(T, x):
+        return list(map(T, x if isinstance(x, list) else [x]))
    for operation in ['vector-axpy', 'matrix-axpy', 'row-wise-reduction', 'matrix-product']:
        if operation in config:
            p = config[operation]
            confdevices = p['devices']
            devices = utils.DEVICES_PRESETS[confdevices] if confdevices in utils.DEVICES_PRESETS else [utils.all_devices[int(i)] for i in confdevices]
-            precisions =  map_to_list((str, p['precision']))
+            precisions =  map_to_list(str, p['precision'])
            datatypes = [DATATYPES[k] for k in precisions]
            #Iterate through the datatypes and the devices
            for datatype, device in itertools.product(datatypes, devices):
@@ -68,18 +64,23 @@ def do_tuning(config_fname, spec_fname, viennacl_root):
                    sys.stderr.write('Warning : The device ' + device.name + ' does not support double precision! Skipping ...')
                    continue
                #Helper
-                def execute(statement, other_params, sizes, fname = os.devnull):
+                def execute(device, statement, other_params, sizes, fname = os.devnull, parameters = None):
+                    if parameters:
+                        TemplateType = TYPES[operation]['template']
+                        return tools.benchmark(TemplateType(TemplateType.Parameters(*parameters),*other_params), statement, device)
                    print('-----')
                    print(' '.join(map(str, ("Now tuning:", datatype.__name__, '-', operation, '-'.join(other_params), '[' + device.name, '(' + device.platform.name + ')] for sizes', sizes))))
                    with open(fname, "w+") as archive:
-                        return optimize.genetic(statement, ctx, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params),
-                                      TYPES[operation]['parameter-names'], lambda t: TYPES[operation]['perf-index']([datatype().itemsize, sizes, t]), TYPES[operation]['perf-measure'], archive)
-                s = map_to_list((int, p['size']))
+                        return optimize.genetic(statement, device, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params),
+                                                lambda t: TYPES[operation]['perf-index']([datatype().itemsize, sizes, t]), TYPES[operation]['perf-measure'], archive)
                #Vector AXPY
                if operation=='vector-axpy':
-                    x = vcl.Vector(s[0], context=ctx, dtype=datatype)
-                    y = vcl.Vector(s[0], context=ctx, dtype=datatype)
-                    execute(vcl.ElementProd(vcl.exp(x + y),vcl.cos(x + y)), ())
+                    def execution_handler(sizes, fname=os.devnull, parameters=None):
+                        x = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
+                        y = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
+                        return execute(device, vcl.Statement(vcl.ElementProd(vcl.exp(x + y),vcl.cos(x + y))), (), sizes, fname, parameters)
+                    if 'size' in p:
+                        profile = execution_handler(map_to_list(int, p['size']))
                #Matrix AXPY
                if operation=='matrix-axpy':
                    A = vcl.Matrix(s, context=ctx, dtype=datatype)
@@ -112,11 +113,10 @@ def do_tuning(config_fname, spec_fname, viennacl_root):
                            beta = vcl.HostScalar(1.0, context=ctx, dtype = datatype)
                            C = vcl.Matrix((sizes[0], sizes[2]), context=ctx, dtype = datatype, layout=vcl.COL_MAJOR)
                            statement = vcl.Statement(vcl.Assign(C,LHS*RHS*alpha + C*beta))
-                            if parameters:
-                                TemplateType = TYPES[operation]['template']
-                                return tools.benchmark(TemplateType(TemplateType.Parameters(*parameters),A_trans,B_trans), statement, device)
+                            return execute(device, statement,(A_trans, B_trans), sizes, fname, parameters)
+                        if 'size' in p:
+                            profile = execution_handler(map(int, p['size']))
                        else:
-                                return execute(statement,(A_trans, B_trans), sizes, fname)
                            X, Y, profiles = generate_dataset(TYPES[operation]['template'], execution_handler)
                            train_model(X, Y, profiles)

--- a/autotune/python/dataset.py
+++ b/autotune/python/dataset.py
@@ -28,7 +28,7 @@ def resample(X, tbincount, densities, step):
    return x.astype(int)

 def generate_dataset(TemplateType, execution_handler):
-    I = 10
+    I = 50
    step = 64
    path = "./data"

@@ -47,13 +47,13 @@ def generate_dataset(TemplateType, execution_handler):
    # densities = [KernelDensity(kernel='gaussian', bandwidth=2*step).fit(X[t==i,:]) for i in range(int(max(t))+1)];
    #
    # print "Generating the dataset..."
-    # N = 1000
+    # N = 10000
    # Y = np.empty((N, len(profiles)))
    # X = np.empty((N,3))
    # t = []
    #
    # for i in range(N):
-    #     x = resample(X, np.bincount(t), densities, step)
+    #     x = resample(X, [], [], step)
    #     for j,y in enumerate(profiles):
    #         T = execution_handler(x, os.devnull, decode(map(int, y)))
    #         Y[i,j] = 2*1e-9*x[0]*x[1]*x[2]/T
@@ -61,6 +61,9 @@ def generate_dataset(TemplateType, execution_handler):
    #     X[i,:] = x
    #     t = np.argmax(Y[:i+1,], axis=1)
    #     densities[idx].fit(X[t==idx,:])
+    #     if i%10==0:
+    #         sys.stdout.write('%d data points generated\r'%i)
+    #         sys.stdout.flush()
    #
    # np.savetxt(os.path.join(path,"profiles.csv"), profiles)
    # np.savetxt(os.path.join(path,"X.csv"), X)
--- a/autotune/python/genetic.py
+++ b/autotune/python/genetic.py
@@ -33,10 +33,9 @@ def b_gray_to_bin(A='00000000', endian='big'):

 class GeneticOperators(object):

-    def __init__(self, device, statement, parameter_names, TemplateType, build_template, out):
+    def __init__(self, device, statement, TemplateType, build_template, out):
        self.device = device
        self.statement = statement
-        self.parameter_names = parameter_names
        self.TemplateType = TemplateType
        self.ParameterType = TemplateType.Parameters
        self.build_template = build_template
@@ -44,6 +43,11 @@ class GeneticOperators(object):
        self.indpb = 0.05
        self.out = out

+        self.genome_info = {
+                            vcl.atidlas.VectorAxpyTemplate: [3,4,4,vcl.atidlas.FetchingPolicy],
+                            vcl.atidlas.MatrixProductTemplate: [3,3,3,3,3,3,3,vcl.atidlas.FetchingPolicy,vcl.atidlas.FetchingPolicy,3]
+                           }[TemplateType]
+
        creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
        creator.create("Individual", list, fitness=creator.FitnessMin)

@@ -54,35 +58,39 @@ class GeneticOperators(object):
        self.toolbox.register("mutate", self.mutate)
        self.toolbox.register("select", deap_tools.selNSGA2)

-    @staticmethod
-    def decode(s):
+    def decode(self, genome):
        FetchingPolicy = vcl.atidlas.FetchingPolicy
        fetch = [FetchingPolicy.FETCH_FROM_LOCAL, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS, FetchingPolicy.FETCH_FROM_GLOBAL_STRIDED]
-        fetchA = fetch[s[0]]
-        fetchB = fetch[s[1]]
-        bincode = ''.join(s[2:])
-        decode_element = lambda x:2**int(b_gray_to_bin(x), 2)
-        simd = decode_element(bincode[0:3])
-        ls0 = decode_element(bincode[2:5])
-        ls1 = decode_element(bincode[5:8])
-        kL = decode_element(bincode[8:11])
-        mS = decode_element(bincode[11:14])
-        kS = decode_element(bincode[14:17])
-        nS = decode_element(bincode[17:20])
-        if fetchA==FetchingPolicy.FETCH_FROM_LOCAL or fetchB==FetchingPolicy.FETCH_FROM_LOCAL:
-            lf0 = decode_element(bincode[20:23])
-            lf1 = ls0*ls1/lf0
+        decode_element = lambda x:2**int(b_gray_to_bin(''.join(x)), 2)
+        result = []
+        offset = 0
+        for x in self.genome_info:
+            if x==vcl.atidlas.FetchingPolicy:
+                result.append(fetch[genome[offset]])
+                offset = offset + 1
            else:
-            lf0, lf1 = 0, 0
-        return [simd, ls0, kL, ls1, mS, kS, nS, fetchA, fetchB, lf0, lf1]
+                result.append(decode_element(genome[offset:offset+x]))
+                offset = offset + x
+        #GEMM peculiarities
+        if self.TemplateType==vcl.atidlas.MatrixProductTemplate:
+            if FetchingPolicy.FETCH_FROM_LOCAL in result:
+                lf1 = result[1]*result[3]/result[9]
+            else:
+                result[9] = 0
+                lf1 = 0
+            result.append(lf1)
+        return result

    def init(self, N):
        result = []
-        fetchcount = [0, 0, 0]
        while len(result) < N:
            while True:
-                fetch = random.randint(0,2)
-                bincode = [fetch, fetch] + [str(random.randint(0,1)) for i in range(23)]
+                bincode = []
+                for x in self.genome_info:
+                    if x==vcl.atidlas.FetchingPolicy:
+                        bincode = bincode + [random.randint(0,2)]
+                    else:
+                        bincode = bincode + [str(random.randint(0,1)) for i in range(x)]
                parameters = self.decode(bincode)
                template = self.build_template(self.TemplateType.Parameters(*parameters))
                registers_usage = template.registers_usage(vcl.atidlas.StatementsTuple(self.statement))/4
@@ -90,22 +98,18 @@ class GeneticOperators(object):
                local_size = template.parameters.local_size_0*template.parameters.local_size_1
                occupancy_record = tools.OccupancyRecord(self.device, local_size, lmem_usage, registers_usage)
                if not tools.skip(template, self.statement, self.device):
-                    fetchcount[fetch] = fetchcount[fetch] + 1
-                    if max(fetchcount) - min(fetchcount) <= 1:
                    result.append(creator.Individual(bincode))
                    break
-                    else:
-                        fetchcount[fetch] = fetchcount[fetch] - 1
        return result

    def mutate(self, individual):
        while True:
            new_individual = copy.deepcopy(individual)
            for i in range(len(new_individual)):
-                if i < 2 and random.random() < self.indpb:
+                if isinstance(individual[i], int) and random.random() < self.indpb:
                    while new_individual[i] == individual[i]:
                        new_individual[i] = random.randint(0, 2)
-                elif i >= 2 and random.random() < self.indpb:
+                elif not isinstance(individual[i], int) and random.random() < self.indpb:
                    new_individual[i] = '1' if new_individual[i]=='0' else '0'
            parameters = self.decode(new_individual)
            template = self.build_template(self.TemplateType.Parameters(*parameters))
@@ -176,7 +180,7 @@ class GeneticOperators(object):
            population[:] = self.toolbox.select(population + offspring, mu)
            #Update
            gen = gen + 1
-            best_profile = '(%s)'%','.join(map(str,GeneticOperators.decode(hof[0])));
+            best_profile = '(%s)'%','.join(map(str,self.decode(hof[0])));
            best_performance = compute_perf(hof[0].fitness.values[0])
            sys.stdout.write('Time %d | Best %d %s [ for %s ]\r'%(time.time() - start_time, best_performance, perf_metric, best_profile))
            sys.stdout.flush()
--- a/autotune/python/model.py
+++ b/autotune/python/model.py
@@ -13,12 +13,13 @@ def train_model(X, Y, profiles):
    Xmean = np.mean(X, axis=0)
    Xstd = np.std(X, axis=0)
    X = (X - Xmean)/Xstd
+
    Ymax = np.max(Y)
    Y = Y/Ymax

    ref = np.argmax(np.bincount(np.argmax(Y, axis=1))) #most common profile
    #Cross-validation data-sets
-    cut = int(0.1*X.shape[0]+1)
+    cut = int(0.800*X.shape[0]+1)
    XTr = X[0:cut, :]
    YTr = Y[0:cut, :]
    XTe = X[cut:,:]
@@ -26,23 +27,15 @@ def train_model(X, Y, profiles):

    #Train the model
    print("Training the model...")
-    ds = SupervisedDataSet(X.shape[1], Y.shape[1])
-    for idx, x in enumerate(X):
-        ds.addSample(x, Y[idx,:])
-    clf = buildNetwork(*[X.shape[1], 100, Y.shape[1]], hiddenclass = TanhLayer, outclass = LinearLayer)
-    #print fnn;
-    #trainer = RPropMinusTrainer( fnn, dataset=ds, verbose=True);
-    trainer = BackpropTrainer( clf, dataset=ds, verbose=True, momentum=0.01, weightdecay=0.01, learningrate=0.002, batchlearning=False)
-    trainer.trainUntilConvergence(maxEpochs=100)
+    clf = ensemble.RandomForestRegressor(40).fit(XTr,YTr)

    #Evaluate the model
    GFlops = np.empty(XTe.shape[0])
    speedups = np.empty(XTe.shape[0])
    optspeedups = np.empty(XTe.shape[0])
    for i,x in enumerate(XTe):
-        predictions = clf.activate(x)
+        predictions = clf.predict(x)
        label = np.argmax(predictions)
-        # print YTe[i,label], YTe[i,ref], np.max(YTe[i,:])
        speedups[i] = YTe[i,label]/YTe[i,ref]
        optspeedups[i] = np.max(YTe[i,:])/YTe[i,ref]
        GFlops[i] = YTe[i,ref]*Ymax
@@ -52,7 +45,5 @@ def train_model(X, Y, profiles):
    print("Average testing speedup : %f (Optimal : %f)"%(sp.stats.gmean(speedups), sp.stats.gmean(optspeedups)))
    print("Average GFLOP/s : %f (Default %f, Optimal %f)"%(np.mean(np.multiply(GFlops,speedups)), np.mean(GFlops), np.mean(np.multiply(GFlops,optspeedups))))
    print("Minimum speedup is %f wrt %i GFlops"%(np.min(speedups), GFlops[np.argmin(speedups)]))
-    print("Maximum speedup is %f wrt %i GFlops"%(np.max(speedups), GFlops[np.argmax(speedups)]))
+    print("Maximum speedup is %f wrt %i GFlops for %s"%(np.max(speedups), GFlops[np.argmax(speedups)], X[np.argmax(speedups)]*Xstd+Xmean))
    print("--------")
-
-    print clf
--- a/autotune/python/optimize.py
+++ b/autotune/python/optimize.py
@@ -48,6 +48,6 @@ from genetic import GeneticOperators
  #~ sys.stdout.flush()
    #~

-def genetic(statement, context, TemplateType, build_template, parameter_names, compute_perf, perf_metric, out):
-    GA = GeneticOperators(context.devices[0], statement, parameter_names, TemplateType, build_template, out)
+def genetic(statement, device, TemplateType, build_template, compute_perf, perf_metric, out):
+    GA = GeneticOperators(device, statement, TemplateType, build_template, out)
    return GA.optimize(maxtime='2m30s', maxgen=1000, compute_perf=compute_perf, perf_metric=perf_metric)