From 9a5627e20fe8adfbb39ae98c7be3a56c84a858c5 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Tue, 28 Oct 2014 01:10:14 -0400
Subject: [PATCH] Rework the autotuner command line: 'simple' and 'full'
 tuning sub-commands

* Replace the 'unique' method by a 'simple' sub-command whose per-BLAS-level
  sizes (--blas1-size, --blas2-size, --blas3-size) supersede the flat
  --sizes list.
* Add a 'full' sub-command that tunes over randomly sampled sizes
  (--sample-size) and optionally trains an input-dependent model
  (--build-model).
* Hoist --viennacl-src-path up to the 'tune' command so both methods can
  update the ViennaCL headers.
* Split dataset generation into dataset.sample_profiles() and
  dataset.sample_dataset(), replacing the commented-out generate_dataset().
* Drop the dead, commented-out exhaustive search from optimize.py.
---
 python/autotune/pysrc/autotune.py | 58 +++++++++++---------
 python/autotune/pysrc/dataset.py  | 91 ++++++++++++++++---------------
 python/autotune/pysrc/optimize.py | 39 -------
 3 files changed, 77 insertions(+), 111 deletions(-)

diff --git a/python/autotune/pysrc/autotune.py b/python/autotune/pysrc/autotune.py
index 8fbf63a36..b8cd97715 100644
--- a/python/autotune/pysrc/autotune.py
+++ b/python/autotune/pysrc/autotune.py
@@ -1,8 +1,7 @@
 from __future__ import division
 import argparse, itertools, os, sys, json
-import misc_tools, optimize
-
+import misc_tools, optimize, dataset
 import pyopencl as cl
 import pyviennacl as vcl
 import pyatidlas as atd
@@ -10,7 +9,6 @@
 import numpy as np
 from configobj import ConfigObj
 from numpy import random
-from dataset import generate_dataset
 from model import train_model
@@ -42,10 +40,10 @@ def do_tuning(args, devices):
     def map_to_list(T, x):
         return list(map(T, x if isinstance(x, list) else [x]))

-    if(args.method=='unique'):
-        default_tuning_sizes = {'vector-axpy': tuple(args.sizes[:1]), 'reduction': tuple(args.sizes[:1]),
-                                'matrix-axpy' : tuple(args.sizes[1:3]), 'row-wise-reduction' : tuple(args.sizes[1:3]),
-                                'matrix-product': tuple(args.sizes[3:])}
+    if(args.method=='simple'):
+        default_tuning_sizes = {'vector-axpy': [args.blas1_size], 'reduction': [args.blas1_size],
+                                'matrix-axpy' : args.blas2_size, 'row-wise-reduction' : args.blas2_size,
+                                'matrix-product': args.blas3_size}

     for operation in ['vector-axpy', 'matrix-axpy', 'reduction', 'row-wise-reduction', 'matrix-product']:
         #Iterate through the datatypes
@@ -72,7 +70,7 @@ def do_tuning(args, devices):
                             lambda t: TYPES[operation]['perf-index']([datatype().itemsize, sizes, t]),
                             TYPES[operation]['perf-measure'], archive)
             #Helper for tuning
-            def tune(execution_handler, nTuning, nDataPoints, draw, additional_parameters):
+            def tune(execution_handler, n_datapoints, sampler, additional_parameters):
                 #Update JSON
                 full_operation = operation + ''.join(additional_parameters)
                 if full_operation not in json_out:
@@ -80,20 +78,22 @@ def do_tuning(args, devices):
                     json_out[full_operation][datatype.__name__] = {}
                 D = json_out[full_operation][datatype.__name__]

-                if args.method == 'unique':
+                if args.method == 'simple':
                     profiles = [execution_handler(map(int,default_tuning_sizes[operation]))]
-                    if args.viennacl_src_path:
-                        misc_tools.update_viennacl_headers(args.viennacl_src_path,device,datatype,operation,additional_parameters,profiles[0])
                 else:
                     def compute_perf(x, t):
                         return TYPES[operation]['perf-index']([datatype().itemsize, x, t])
-                    X, Y, profiles = generate_dataset(TYPES[operation]['template'], execution_handler, nTuning, nDataPoints, draw)
-                    clf = train_model(X, Y, profiles, TYPES[operation]['perf-measure'])
-                    D['predictor'] = [{'children_left': e.tree_.children_left.tolist(),
-                                       'children_right': e.tree_.children_right.tolist(),
-                                       'threshold': e.tree_.threshold.astype('float32').tolist(),
-                                       'feature': e.tree_.feature.astype('float32').tolist(),
-                                       'value': e.tree_.value[:,:,0].astype('float32').tolist()} for e in clf.estimators_]
+                    profiles = dataset.sample_profiles(execution_handler, args.sample_size, sampler)
+                    if args.build_model:
+                        X, Y = dataset.sample_dataset(os.path.join(full_operation,datatype.__name__), profiles, execution_handler, n_datapoints, sampler)
+                        clf = train_model(X, Y, profiles, TYPES[operation]['perf-measure'])
+                        D['predictor'] = [{'children_left': e.tree_.children_left.tolist(),
+                                           'children_right': e.tree_.children_right.tolist(),
+                                           'threshold': e.tree_.threshold.astype('float32').tolist(),
+                                           'feature': e.tree_.feature.astype('float32').tolist(),
+                                           'value': e.tree_.value[:,:,0].astype('float32').tolist()} for e in clf.estimators_]
+                if args.viennacl_src_path:
+                    misc_tools.update_viennacl_headers(args.viennacl_src_path,device,datatype,operation,additional_parameters,profiles[0])
                 D['profiles'] = [ prof.astype('int').tolist() for prof in profiles]
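For reference, each entry of D['predictor'] above serializes one tree of the
trained ensemble using scikit-learn's flat node layout: node i is a leaf when
children_left[i] == -1, and otherwise tests x[feature[i]] <= threshold[i].
A minimal evaluator for one serialized tree could look like the sketch below
(predict_tree is a hypothetical helper, not part of this patch):

    def predict_tree(tree, x):
        #tree: one dict from D['predictor']; x: the size vector of an operation
        i = 0
        while tree['children_left'][i] != -1:   #-1 marks a leaf in sklearn's layout
            if x[int(tree['feature'][i])] <= tree['threshold'][i]:
                i = int(tree['children_left'][i])
            else:
                i = int(tree['children_right'][i])
        return tree['value'][i]   #per-profile scores stored at the leaf

Averaging the leaf values returned by predict_tree over all serialized trees
approximates the ensemble's prediction of the best profile for a given size.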
@@ -104,7 +104,7 @@ def do_tuning(args, devices):
                     y = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
                     z = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
                     return execute(device, vcl.Assign(z, vcl.ElementProd(vcl.exp(x + y),vcl.cos(x + y))), (), sizes, fname, parameters)
-                tune(execution_handler, 30, 1000, lambda : 64*np.random.randint(low=10, high=100000, size=1), ())
+                tune(execution_handler, 1000, lambda : 64*np.random.randint(low=10, high=100000, size=1), ())
             #Reduction
             if operation=='reduction':
                 def execution_handler(sizes, fname=os.devnull, parameters=None):
@@ -112,7 +112,7 @@ def do_tuning(args, devices):
                     y = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
                     s = vcl.Scalar(0, context=ctx, dtype=datatype)
                     return execute(device, vcl.Assign(s, vcl.Dot(x,y)), (), sizes, fname, parameters)
-                tune(execution_handler, 30, 1000, lambda : 64*np.random.randint(low=10, high=100000, size=1), ())
+                tune(execution_handler, 1000, lambda : 64*np.random.randint(low=10, high=100000, size=1), ())
             #Matrix AXPY
             if operation=='matrix-axpy':
                 def execution_handler(sizes, fname=os.devnull, parameters=None):
@@ -120,7 +120,7 @@ def do_tuning(args, devices):
                     B = vcl.Matrix(sizes, context=ctx, dtype=datatype)
                     C = vcl.Matrix(sizes, context=ctx, dtype=datatype)
                     return execute(device, vcl.Assign(C,A+B), (), sizes, fname, parameters)
-                tune(execution_handler, 30, 1000, lambda : 64*np.random.randint(low=5, high=100, size=2), ())
+                tune(execution_handler, 1000, lambda : 64*np.random.randint(low=5, high=100, size=2), ())
             #Row-wise reduction
             if operation=='row-wise-reduction':
                 layouts = ['N', 'T']
@@ -131,7 +131,7 @@ def do_tuning(args, devices):
                     y = vcl.Vector(sizes[0] if A_trans=='N' else sizes[1], context=ctx, dtype=datatype)
                     LHS = A if A_trans=='N' else A.T
                     return execute(device, vcl.Assign(y, LHS*x), (), sizes, fname, parameters)
-                tune(execution_handler, 30, 1000, lambda : 64*np.random.randint(low=5, high=100, size=2), (A_trans,))
+                tune(execution_handler, 1000, lambda : 64*np.random.randint(low=5, high=100, size=2), (A_trans,))
             #Matrix Product
             if operation=='matrix-product':
                 layouts = ['NN', 'NT', 'TN', 'TT']
@@ -147,7 +147,7 @@ def do_tuning(args, devices):
                     beta = vcl.HostScalar(1.0, context=ctx, dtype = datatype)
                     C = vcl.Matrix((sizes[0], sizes[2]), context=ctx, dtype = datatype, layout=vcl.COL_MAJOR)
                     return execute(device, vcl.Assign(C,LHS*RHS*alpha + C*beta),(A_trans, B_trans), sizes, fname, parameters)
-                tune(execution_handler, 30, 1000, lambda : 64*np.random.randint(low=1, high=40, size=3),(layout[0], layout[1]))
+                tune(execution_handler, 1000, lambda : 64*np.random.randint(low=1, high=40, size=3),(layout[0], layout[1]))

     dname = misc_tools.sanitize_string(device.name)
     json_out["version"] = "1.0"
@@ -161,14 +161,18 @@ if __name__ == "__main__":
     print_devices_parser = subparsers.add_parser('list-devices', help='list the devices available')
     tune_parser = subparsers.add_parser('tune', help='tune using a specific configuration file')
     tune_parser.add_argument("--device", default=0, required=False, type=str)
+    tune_parser.add_argument("--viennacl-src-path", default='', type=str)
     tune_subparsers = tune_parser.add_subparsers(dest='method')
-    big_sizes_parser = tune_subparsers.add_parser('unique', help = 'Tune each operation for unique sizes')
-    big_sizes_parser.add_argument("--sizes", nargs='+', default=[10e6,2560,2560,1536,1536,1536], required=False, type=int, help = '6 = 1 + 2 + 3 sizes for respectively BLAS1, BLAS2, BLAS3')
-    big_sizes_parser.add_argument("--viennacl-src-path", default='', required=False, type=str)
+    simple_parser = tune_subparsers.add_parser('simple', help = 'Tune each operation for a fixed, user-given size')
-    model_parser = tune_subparsers.add_parser('build-model', help = 'Build an input-dependent model')
+    simple_parser.add_argument("--blas1-size", default=int(10e6), type=int)
+    simple_parser.add_argument("--blas2-size", nargs=2, default=[2560,2560], type=int)
+    simple_parser.add_argument("--blas3-size", nargs=3, default=[1536,1536,1536], type=int)
+    full_parser = tune_subparsers.add_parser('full', help = 'Tune each operation for randomly chosen sizes')
+    full_parser.add_argument("--build-model", action='store_true')
+    full_parser.add_argument("--sample-size", default=30, type=int)

     args = parser.parse_args()
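The resulting sub-command surface can be checked without a GPU by handing
parse_args() an explicit argument list. The snippet below mirrors only the
options defined above and is illustrative, not part of the patch:

    import argparse

    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest='action')
    tune = subparsers.add_parser('tune')
    tune.add_argument("--device", default=0, type=str)
    tune.add_argument("--viennacl-src-path", default='', type=str)
    methods = tune.add_subparsers(dest='method')
    simple = methods.add_parser('simple')
    simple.add_argument("--blas1-size", default=int(10e6), type=int)
    simple.add_argument("--blas2-size", nargs=2, default=[2560,2560], type=int)
    simple.add_argument("--blas3-size", nargs=3, default=[1536,1536,1536], type=int)
    full = methods.add_parser('full')
    full.add_argument("--build-model", action='store_true')
    full.add_argument("--sample-size", default=30, type=int)

    #Equivalent to: python autotune.py tune full --sample-size 50 --build-model
    args = parser.parse_args(['tune', 'full', '--sample-size', '50', '--build-model'])
    assert args.method == 'full' and args.sample_size == 50 and args.build_model

Note that --build-model uses action='store_true' rather than type=bool:
argparse's type callable receives a string, and bool('False') is True, so
type=bool would silently enable the model for any explicit value.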
diff --git a/python/autotune/pysrc/dataset.py b/python/autotune/pysrc/dataset.py
index c06476217..f9c736f07 100644
--- a/python/autotune/pysrc/dataset.py
+++ b/python/autotune/pysrc/dataset.py
@@ -4,64 +4,65 @@
 import re
 import random
 import numpy as np

-def resample(X, draw):
+def resample(X, sampler):
     Xtuples = [tuple(x) for x in X]
     r = random.random()
     while(True):
-        x = draw()
+        x = sampler()
         if tuple(x) not in Xtuples:
             break
     return x.astype(int)

-def generate_dataset(TemplateType, execution_handler, nTuning, nDataPoints, draw):
+def sample_profiles(execution_handler, nTuning, sampler):
+    #Tune at nTuning random sizes and collect the distinct winning profiles
+    print "Sampling profiles..."
+    nDim = sampler().size
+    X = np.empty((nTuning, nDim))
+    t = np.empty(nTuning)
+    profiles = []
+    for i in range(nTuning):
+        x = resample(X, sampler)
+        y = execution_handler(x)
+        if y not in profiles:
+            profiles.append(y)
+        idx = profiles.index(y)
+        X[i,:] = x
+        t[i] = idx

-    # print "Getting some good profiles..."
-    # nDim = draw().size
-    # X = np.empty((nTuning, nDim))
-    # t = np.empty(nTuning)
-    # profiles = []
-    # for i in range(nTuning):
-    #     x = resample(X, draw)
-    #     y = execution_handler(x)
-    #     if y not in profiles:
-    #         profiles.append(y)
-    #     idx = profiles.index(y)
-    #     X[i,:] = x
-    #     t[i] = idx
-    #
-    # print "Generating the dataset..."
-    # Y = np.empty((nDataPoints, len(profiles)))
-    # X = np.empty((nDataPoints, nDim))
-    # t = []
-    #
-    # for i in range(nDataPoints):
-    #     x = resample(X, draw)
-    #     for j,y in enumerate(profiles):
-    #         T = execution_handler(x, os.devnull, y)
-    #         Y[i,j] = T
-    #     idx = np.argmax(Y[i,:])
-    #     X[i,:] = x
-    #     t = np.argmax(Y[:i+1,], axis=1)
-    #     if i%10==0:
-    #         sys.stdout.write('%d data points generated\r'%i)
-    #         sys.stdout.flush()
+    #Move the profile that wins at the largest sampled size to the front
+    idx = int(t[np.argmax(np.linalg.norm(X, axis=1))])
+    profiles = np.array([profiles[idx]] + [x for i,x in enumerate(profiles) if i!=idx])
+    return profiles

-    template_name = TemplateType.__name__
-    dir = os.path.join("data", template_name)
-    if not os.path.exists(dir):
-        os.makedirs(dir)
+def sample_dataset(prefix_name, profiles, execution_handler, nDataPoints, sampler):
+    #Time every profile at nDataPoints random sizes to build a training set

-    # np.savetxt(os.path.join(dir,"profiles.csv"), profiles)
-    # np.savetxt(os.path.join(dir,"X.csv"), X)
-    # np.savetxt(os.path.join(dir,"Y.csv"), Y)
+    print "Generating the dataset..."
+    Y = np.empty((nDataPoints, len(profiles)))
+    X = np.empty((nDataPoints, sampler().size))   #one column per size dimension
+    t = []

-    profiles = np.loadtxt(os.path.join(dir, "profiles.csv"))
-    X = np.loadtxt(os.path.join(dir, "X.csv"),ndmin=2)
-    Y = np.loadtxt(os.path.join(dir, "Y.csv"),ndmin=2)
+    for i in range(nDataPoints):
+        x = resample(X, sampler)
+        for j,y in enumerate(profiles):
+            T = execution_handler(x, os.devnull, y)
+            Y[i,j] = T
+        idx = np.argmax(Y[i,:])
+        X[i,:] = x
+        t = np.argmax(Y[:i+1,], axis=1)
+        if i%10==0:
+            sys.stdout.write('%d data points generated\r'%i)
+            sys.stdout.flush()

-    #idx = np.argsort(np.bincount(np.argmin(Y, axis=1)))
     idx = np.argsort(Y[np.argmax(X),:])
     Y = Y[:, idx]
     profiles = profiles[idx]
+
+    dir = os.path.join("data", prefix_name)
+    if not os.path.exists(dir):
+        os.makedirs(dir)
+    np.savetxt(os.path.join(dir,"X.csv"), X)
+    np.savetxt(os.path.join(dir,"Y.csv"), Y)
+    np.savetxt(os.path.join(dir,"profiles.csv"), profiles)
+    X = np.loadtxt(os.path.join(dir, "X.csv"),ndmin=2)
+    Y = np.loadtxt(os.path.join(dir, "Y.csv"),ndmin=2)
+    profiles = np.loadtxt(os.path.join(dir, "profiles.csv"))

-    return X, Y, profiles
+    return X, Y
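Together these two functions implement the pipeline behind the 'full' method:
sample_profiles() gathers the distinct winning profiles over random problem
sizes, and sample_dataset() then times every one of those profiles at fresh
random sizes to build the (X, Y) training set consumed by train_model(). The
toy run below sketches the contract, with a fake execution_handler standing
in for the real OpenCL benchmark; every value in it is illustrative:

    import numpy as np
    import dataset

    #Fake benchmark: called with sizes only, it returns the winning profile
    #for those sizes; called with (sizes, fname, profile), a pseudo-time.
    def fake_execution_handler(sizes, fname=None, profile=None):
        if profile is None:
            return (1, 64) if sizes[0] < 2048 else (4, 128)
        return float(sizes[0]) / (profile[0]*profile[1])

    sampler = lambda: 64*np.random.randint(low=1, high=100, size=1)
    profiles = dataset.sample_profiles(fake_execution_handler, 10, sampler)
    X, Y = dataset.sample_dataset('toy', profiles, fake_execution_handler, 20, sampler)
    #X has one row per sampled size and Y one column per profile; both are
    #also written under data/toy/ as CSV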
diff --git a/python/autotune/pysrc/optimize.py b/python/autotune/pysrc/optimize.py
index 8df8aa438..134fe0d8c 100644
--- a/python/autotune/pysrc/optimize.py
+++ b/python/autotune/pysrc/optimize.py
@@ -4,45 +4,6 @@
 import numpy as np

 from genetic import GeneticOperators

-#~ def parameter_space(operation):
-    #~ simd = [1, 2, 4, 8]
-    #~ pow2_1D = [2**k for k in range(12)]
-    #~ pow2_2D = [2**i for i in range(8)]
-    #~ pow2_2D_unrolled = [2**i for i in range(8)]
-    #~ FetchingPolicy = vcl.atidlas.FetchingPolicy
-    #~ fetch = [FetchingPolicy.FETCH_FROM_LOCAL, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS, FetchingPolicy.FETCH_FROM_GLOBAL_STRIDED]
-    #~ if operation == 'vector-axpy': return [simd, pow2_1D, pow2_1D, fetch]
-    #~ if operation == 'reduction': return [simd, pow2_1D, pow2_1D, fetch]
-    #~ if operation == 'matrix-axpy': return [simd, pow2_2D, pow2_2D, pow2_2D, pow2_2D, fetch]
-    #~ if operation == 'row-wise-reduction': return [simd, pow2_2D, pow2_2D, pow2_1D, fetch]
-    #~ if operation == 'matrix-product': return [simd, pow2_2D, pow2_2D, pow2_2D, pow2_2D_unrolled, pow2_2D_unrolled, pow2_2D_unrolled, fetch, fetch, [0] + pow2_2D, [0] + pow2_2D]
-    #~
-
-#~ def exhaustive(statement, context, TemplateType, build_template, parameter_names, all_parameters, compute_perf, perf_metric, out):
-    #~ device = context.devices[0]
-    #~ nvalid = 0
-    #~ current = 0
-    #~ minT = float('inf')
-    #~ for individual in itertools.product(*all_parameters):
-        #~ template = build_template(TemplateType.Parameters(*individual))
-        #~ if not tools.skip(template, statement, device):
-            #~ nvalid = nvalid + 1
-    #~ for individual in itertools.product(*all_parameters):
-        #~ template = build_template(TemplateType.Parameters(*individual))
-        #~ try:
-            #~ T = tools.benchmark(template,statement,device)
-            #~ current = current + 1
-            #~ if T < minT:
-                #~ minT = T
-                #~ best = individual
-            #~ sys.stdout.write('%d / %d , Best is %d %s for %s\r'%(current, nvalid, compute_perf(minT), perf_metric, best))
-            #~ sys.stdout.flush()
-        #~ except:
-            #~ pass
-    #~ sys.stdout.write('\n')
-    #~ sys.stdout.flush()
-    #~
-
 def genetic(statement, device, TemplateType, build_template, compute_perf, perf_metric, out):
     GA = GeneticOperators(device, statement, TemplateType, build_template, out)
     return GA.optimize(maxtime='2m30s', maxgen=1000, compute_perf=compute_perf, perf_metric=perf_metric)
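The genetic() entry point kept above receives its objective through
compute_perf; in autotune.py this is bound to a lambda evaluating
TYPES[operation]['perf-index']([datatype().itemsize, sizes, t]), whose real
definitions live outside this patch. For the matrix product the index is
presumably a GFLOP/s count, and for the bandwidth-bound kernels a GB/s count,
along the lines of this sketch (both functions hypothetical):

    #Hypothetical perf-index functions, assuming t is a time in seconds.
    def matrix_product_perf_index(args):
        itemsize, (M, N, K), t = args
        return 2.0*M*N*K/t*1e-9        #GFLOP/s for C = alpha*A*B + beta*C

    def vector_axpy_perf_index(args):
        itemsize, (N,), t = args
        return 3.0*N*itemsize/t*1e-9   #GB/s: read x and y, write z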