Squashed feature branch:

* Added CUDA support
* Performance improvements
* API improvements
* Added "depth" parameter to GEMM (see the sketch below)
* Android cross-compilation
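
The "depth" parameter is the only item above that changes the GEMM interface itself. A minimal NumPy sketch of the idea, assuming "depth" has the usual split-K meaning (the reduction dimension K is sliced into `depth` partial products that are summed at the end); the commit does not show the kernel, so this model is illustrative only:

    import numpy as np

    def split_k_gemm(A, B, depth):
        # Slice the K (reduction) dimension into `depth` chunks; a real kernel
        # would compute the partial products in parallel and reduce afterwards.
        K = A.shape[1]
        bounds = np.linspace(0, K, depth + 1).astype(int)
        partials = [np.dot(A[:, lo:hi], B[lo:hi, :])
                    for lo, hi in zip(bounds[:-1], bounds[1:])]
        return sum(partials)

    A, B = np.random.rand(64, 256), np.random.rand(256, 32)
    assert np.allclose(split_k_gemm(A, B, depth=4), np.dot(A, B))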
Author: Philippe Tillet
Date:   2015-04-29 15:50:57 -04:00
Parent: 5ff16bfcb6
Commit: cf5028d55b

3819 changed files with 7080 additions and 2916 deletions

@@ -2,7 +2,7 @@ from __future__ import division
 import argparse, itertools, os, sys, json
 import misc_tools, optimize, dataset
-import pyatidlas as atd
+import pyisaac as atd
 import numpy as np
 from numpy import random
@@ -71,27 +71,20 @@ def do_tuning(args):
         return optimize.genetic(symbolic, Template, lambda t: TYPES[operation]['perf-index']([datatype(0).size, sizes, t]),
                                 TYPES[operation]['perf-measure'], archive)
-    def log_uniform_sample(a,b):
-        return np.exp(np.random.uniform(low=np.log(a), high=np.log(b), size=1)).astype(int)
-    def space_gen_product(a,b,N,dim,method):
-        N = int(N**(1.0/dim))
-        def space_gen(a,b,method):
-            for i in range(N):
-                if method == 'linear':
-                    v = int(a + (b-a)*i/N)
-                if method == 'log':
-                    v = int(np.exp(np.log(a) + (np.log(b) - np.log(a))*i/N))
-                yield (v//64 + 1)*64
-        return tuple(itertools.product(*[space_gen(a,b,method) for i in range(dim)]))
+    def log_spaced_points(a,b,N,r=128):
+        t = np.ceil(np.exp(np.linspace(np.log(a), np.log(b), N))/r)*r
+        return t.reshape(t.size,1).astype(int)
     #Helper for tuning
-    def tune(execution_handler, a, b, dimsample, layouts, sample_method_profiles, sample_method_dataset):
+    def tune(execution_handler, layouts, tuning_sizes, training_sizes):
         print('-----')
         print(' '.join(map(str, ("Now tuning:", dtypestr, '-', operation, '-'.join(layouts), '[' + device.name, '(' + device.platform.name + ')]'))))
         #Update JSON
         full_operation = operation + ''.join(layouts)
+        prefix = os.path.join('data',os.path.join(full_operation,dtypestr))
+        if not os.path.exists(prefix):
+            os.makedirs(prefix)
         if full_operation not in json_out:
             json_out[full_operation] = {}
         json_out[full_operation][dtypestr] = {}
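
The new log_spaced_points helper above replaces the random and grid samplers: it returns N deterministic sizes, logarithmically spaced between a and b and rounded up to the next multiple of r. A standalone check (numpy is already imported by the script):

    import numpy as np

    def log_spaced_points(a, b, N, r=128):
        t = np.ceil(np.exp(np.linspace(np.log(a), np.log(b), N))/r)*r
        return t.reshape(t.size, 1).astype(int)

    print(log_spaced_points(1e4, 1e7, 5).ravel())
    # Five sizes from ~1e4 to 1e7, log-spaced, each a multiple of 128,
    # e.g. [10112 56320 316288 1778304 10000000].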
@@ -103,15 +96,13 @@ def do_tuning(args):
         else:
             def compute_perf(x, t):
                 return TYPES[operation]['perf-index']([datatype(0).size, x, t])
-        profiles_generator = space_gen_product(a, b, args.sample_size, dimsample, sample_method_profiles)
-        profiles = dataset.sample_profiles(execution_handler, profiles_generator)
+        #profiles = dataset.sample_profiles(execution_handler, tuning_sizes)
         if args.build_model:
-            dataset_generator = space_gen_product(a, b, 1000, dimsample, sample_method_dataset)
-            X, Y, profiles = dataset.sample_dataset(os.path.join(full_operation,dtypestr), profiles, execution_handler, dataset_generator)
-            # profiles = np.loadtxt('data/'+full_operation+'/'+datatype+'/profiles.csv')
-            # X = np.loadtxt('data/'+full_operation+'/'+datatype+'/X.csv',ndmin=2)
-            # Y = np.loadtxt('data/'+full_operation+'/'+datatype+'/Y.csv',ndmin=2)
-            clf = train_model(X, Y, profiles, TYPES[operation]['perf-measure'])
+            #X, Y, profiles = dataset.sample_dataset(prefix, profiles, execution_handler, training_sizes)
+            profiles = np.loadtxt(prefix+'/profiles.csv')
+            X = np.loadtxt(prefix+'/X.csv',ndmin=2)
+            Y = np.loadtxt(prefix+'/Y.csv',ndmin=2)
+            clf = train_model(X, Y, profiles, compute_perf, TYPES[operation]['perf-measure'])
         D['predictor'] = [{'children_left': e.tree_.children_left.tolist(),
                            'children_right': e.tree_.children_right.tolist(),
                            'threshold': e.tree_.threshold.astype('float64').tolist(),
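
The D['predictor'] entry above serializes each tree of the trained scikit-learn ensemble into plain JSON arrays, so the model can be evaluated without Python at runtime. A sketch of how one such serialized tree would be walked; the 'feature' and 'value' keys are assumed to follow in the part of the dict cut off by the hunk (scikit-learn's tree_ does expose both, alongside children_left/children_right/threshold):

    def predict_node(tree, x):
        # Walk from the root; in scikit-learn, children_left[i] == -1 marks a leaf.
        i = 0
        while tree['children_left'][i] != -1:
            if x[tree['feature'][i]] <= tree['threshold'][i]:
                i = tree['children_left'][i]
            else:
                i = tree['children_right'][i]
        return tree['value'][i]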
@@ -128,15 +119,15 @@ def do_tuning(args):
             x = atd.empty(sizes[0], datatype, context=context)
             y = atd.empty(sizes[0], datatype, context=context)
             return execute(x + y, sizes, Template, parameters, fname)
-        tune(execution_handler, 1e3, 2e7, 1, (),'log', 'log')
-    #dot
+        tune(execution_handler, (), log_spaced_points(1e4, 1e7, 20), log_spaced_points(1e4, 1e7, 1000))
+    #Dot
     if operation=='dot':
         def execution_handler(sizes, fname=os.devnull, parameters=None):
             x = atd.empty(sizes[0], datatype, context=context)
             y = atd.empty(sizes[0], datatype, context=context)
             s = atd.scalar(datatype)
             return execute(atd.dot(x, y), sizes, Template, parameters, fname)
-        tune(execution_handler, 1e3, 2e7, 1, (),'log', 'log')
+        tune(execution_handler, (), log_spaced_points(1e4, 1e7, 50), log_spaced_points(1e4, 1e7, 1000))
     #Matrix AXPY
     if operation=='maxpy':
         def execution_handler(sizes, fname=os.devnull, parameters=None):
@@ -152,7 +143,14 @@ def do_tuning(args):
             x = atd.empty(sizes[1], datatype, context=context)
             LHS = A if A_trans=='N' else A.T
             return execute(atd.dot(LHS, x), sizes, Template[A_trans], parameters, fname)
-        tune(execution_handler, 64, 6000, 2, (A_trans,),'log', 'log')
+        tuning_sizes = itertools.chain(itertools.product([128, 512, 2048, 8192], [128, 512, 2048, 8192]),
+                                       itertools.product([128, 512, 2048, 8192], [16384, 32768, 65536]),
+                                       itertools.product([16384, 32768, 65536], [128, 512, 2048, 8192]))
+        training_sizes = itertools.chain(itertools.product([2**k for k in range(4, 13)], [2**k for k in range(4, 13)]),
+                                         itertools.product([2**k for k in range(4, 13)], [2**k for k in range(13, 17)]),
+                                         itertools.product([2**k for k in range(13, 17)], [2**k for k in range(4, 13)]))
+        tune(execution_handler, (A_trans,), tuning_sizes, training_sizes)
     #Matrix Product
     if operation=='gemm':
         for L in args.gemm_layouts:
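
A note on the size grids introduced in the hunk above: itertools.product and itertools.chain return one-shot iterators, so each grid can only be traversed once inside tune(). Materializing a copy shows what the GEMV tuning grid covers, a cross of square and strongly rectangular shapes:

    import itertools

    small = [128, 512, 2048, 8192]
    large = [16384, 32768, 65536]
    grid = list(itertools.chain(itertools.product(small, small),
                                itertools.product(small, large),
                                itertools.product(large, small)))
    print(len(grid), grid[0], grid[-1])   # 40 (128, 128) (65536, 8192)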
@@ -164,8 +162,11 @@ def do_tuning(args):
                 LHS = A if A_trans=='N' else A.T
                 RHS = B if B_trans=='N' else B.T
                 return execute(atd.dot(LHS, RHS), sizes, Template[(A_trans, B_trans)], parameters, fname)
-            tune(execution_handler, 100, 2000, 3,(A_trans,B_trans), 'linear', 'linear')
+            tuning_sizes = itertools.product([64, 256, 1024, 2560], [64, 256, 1024, 2560], [256, 2560, 32768, 65536])
+            training_sizes = itertools.product([2**k for k in range(6, 13)], [2**k for k in range(6, 13)], [2**k for k in range(6, 17)])
+            tune(execution_handler,(A_trans,B_trans), tuning_sizes, training_sizes)
     json.dump(json_out, open(args.out,'w'))
@@ -194,7 +195,7 @@ class ArgumentsHandler:
         full_parser = tune_subparsers.add_parser('full', help = 'Tune each operation for randomly chosen sizes')
         full_parser.add_argument("--build-model", default=True, type=bool)
-        full_parser.add_argument("--sample-size", default=60, type=int)
+        full_parser.add_argument("--sample-size", default=64, type=int)
         args = parser.parse_args()
         self.__dict__ = args.__dict__.copy()