triton/python/isaac/autotuning/tune.py

import random, argparse, json, os
from math import log, isinf
from itertools import chain, product
from numpy import argsort, argmax
from operator import mul
import isaac as sc
from external.sklearn.forest import RandomForestRegressor
import optimize, tools, model
from json import encoder
import json

# Patch the stdlib json.encoder module so floats are rendered with two decimals.
# NOTE(review): whether json.dump honours a module-level FLOAT_REPR depends on
# the interpreter version, and the json module does not appear to read a
# module-level `separators` attribute at all -- confirm both still take effect.
encoder.FLOAT_REPR = lambda value: '{0:.2f}'.format(value)
encoder.separators = (',', ':')

def unique(L):
    """Return the items of L as a list, keeping only the first occurrence of each.

    The original order is preserved. Items must be hashable because a set is
    used to remember what has already been emitted.
    """
    already_seen = set()
    kept = []
    for item in L:
        if item in already_seen:
            continue
        already_seen.add(item)
        kept.append(item)
    return kept

def pow2range(a, b):
    """Return [2**a, 2**(a+1), ..., 2**(b-1)]; empty when a >= b."""
    powers = []
    for exponent in range(a, b):
        powers.append(2 ** exponent)
    return powers


def _safe_performance(performance, operation, profile, size, tree):
    """Benchmark `profile` on `tree` and return its performance for `size`.

    Returns 0 when the benchmark raises one of the isaac errors that the tuner
    treats as "this profile cannot run here" (OperationNotSupported,
    LaunchOutOfResources, MemObjectAllocationFailure). Any other exception
    propagates.
    """
    try:
        return performance(size, tools.benchmark(operation, profile, tree))
    except (sc.OperationNotSupported, sc.LaunchOutOfResources, sc.MemObjectAllocationFailure):
        return 0


def tune(device, operation, json_path):
    """Tune `operation` on `device` and store the resulting profiles as JSON.

    For every problem size registered for `operation`, `optimize.genetic` is
    run to find a profile. From the second size on, the search is seeded with
    a previously found profile: the only one available, or the best measured
    among the five that a RandomForestRegressor (fit on the measurements
    gathered so far) ranks highest. Every profile is benchmarked on every size
    to build the training set (X = sizes, Y = per-profile performance).

    The profiles, plus the trees of the model returned by `model.train` when
    more than one profile was found, are written under
    `json_data[operation.__name__]['float32']`.

    Args:
        device: device handed to `sc.driver.context`; `device.name` is used to
            derive the default output file name.
        operation: key into the size table below (an `sc.templates` entry).
        json_path: output file. When falsy, defaults to
            `tools.sanitize(device.name) + '.json'`. If the file already
            exists, it is loaded and updated rather than replaced.

    Returns:
        None. The result is the JSON file written to `json_path`.

    Raises:
        KeyError: if `operation` has no entry in the size table.
    """
    #Context
    context = sc.driver.context(device)

    #List of size tuples to use
    size_table = {}
    size_table[sc.templates.axpy] = [(x,) for x in tools.expspace(1e3, 1e8, 4)]
    # The gemv/gemm grids below are overridden by the quick-tuning lists
    # further down, except gemm_tt which keeps the power-of-two grid.
    size_table[sc.templates.gemv_n] = product(pow2range(4, 17), pow2range(4, 17))
    size_table[sc.templates.gemv_t] = size_table[sc.templates.gemv_n]
    size_table[sc.templates.gemm_nn] = product(pow2range(6, 12), pow2range(6, 12), pow2range(6, 12))
    size_table[sc.templates.gemm_tn] = size_table[sc.templates.gemm_nn]
    size_table[sc.templates.gemm_nt] = size_table[sc.templates.gemm_nn]
    size_table[sc.templates.gemm_tt] = size_table[sc.templates.gemm_nn]

    #Quick tuning - AlexNet sizes + Intuition
    size_table[sc.templates.ger] = [(1536, 1536)]

    size_table[sc.templates.gemv_n] = [(1000, 256),
                                       (4096, 256)]
    size_table[sc.templates.gemv_t] = [(169, 256),
                                       (169, 384),
                                       (729, 256),
                                       (3025, 96)]

    size_table[sc.templates.gemm_nn] = [(3025, 96, 363),
                                        (729, 128, 1200),
                                        (169, 384, 2304),
                                        (169, 192, 1728),
                                        (169, 128, 1728)]
    size_table[sc.templates.gemm_nt] = [(169, 1728, 128),
                                        (169, 1728, 192),
                                        (169, 2304, 384),
                                        (729, 1200, 128)]
    size_table[sc.templates.gemm_tn] = [(1728, 128, 169),
                                        (1728, 192, 169),
                                        (2304, 384, 169),
                                        (1200, 128, 729),
                                        (363, 96, 3025)]

    #Remove duplicated
    sizes = unique(list(size_table[operation]))
    # Keep only sizes whose footprint lies in [1e-4, 1e-1]
    # (units are whatever tools.memory_footprint returns -- TODO confirm).
    sizes = [x for x in sizes if 1e-4 <= tools.memory_footprint(operation, x) <= 1e-1]

    #Training data
    performance = tools.metric_of(operation)
    profiles = []
    X = []
    Y = []
    for idx, x in enumerate(sizes):
        # Parenthesised form prints the same text on Python 2 and works on 3.
        print(x)
        nparams = len(profiles)
        # `operands` is unused but deliberately kept bound for the iteration
        # (presumably it owns the buffers referenced by `tree` -- TODO confirm).
        tree, operands = tools.tree_of(operation, x, context)

        #Pick the profile used to seed the search
        if idx == 0:
            predicted = None
        elif nparams == 1:
            predicted = profiles[0]
        else:
            clf = RandomForestRegressor(min(10, idx+1), max_depth=min(10, idx+1)).fit(X, Y)
            predperf = clf.predict(x)[0]
            best = (-predperf).argsort()[:5]
            # A candidate that cannot run on this size scores 0 instead of
            # aborting the whole tuning run.
            measured = [_safe_performance(performance, operation, profiles[b], x, tree) for b in best]
            predicted = profiles[best[argmax(measured)]]

        # The local-optimum check (optimize.is_local_optimum) is currently
        # disabled, so every size is retuned.
        retune = True

        #Retune if necessary
        if retune:
            new = optimize.genetic(operation, x, context, niter=1000, naccept=1000, popsize=20, prior=predicted)[0]
            if new not in profiles:
                profiles.append(new)
                if idx > 0:
                    # Back-fill the new profile's performance on every size
                    # already in the dataset so all rows of Y keep one
                    # column per profile.
                    for xx, yy in zip(X, Y):
                        _tree, _operands = tools.tree_of(operation, xx, context)
                        perf = _safe_performance(performance, operation, new, xx, _tree)
                        yy.append(0 if isinf(perf) else perf)

        #Update dataset
        y = []
        fastest = max(predperf) if nparams > 1 else None
        for ip, p in enumerate(profiles):
            # Profiles predicted to reach under 10% of the best prediction are
            # recorded as 0 without being benchmarked.
            if fastest and ip < nparams and predperf[ip]/fastest < .1:
                perf = 0
            else:
                perf = _safe_performance(performance, operation, p, x, tree)
            y.append(0 if isinf(perf) else perf)
        X.append(x)
        Y.append(y)

    #Export to JSON
    if not json_path:
        json_path = tools.sanitize(device.name) + '.json'
    if os.path.isfile(json_path):
        # Close the read handle before the file is reopened for writing.
        with open(json_path, 'r') as json_file:
            json_data = json.load(json_file)
    else:
        json_data = {}
        json_data["version"] = "1.0"
    operation_name = operation.__name__
    if operation_name not in json_data:
        json_data[operation_name] = {}
    json_data[operation_name]['float32'] = {}
    D = json_data[operation_name]['float32']
    if len(profiles) > 1:
        clf, nrmse = model.train(X, Y, profiles)
        D['predictor'] = [{'children_left': e.tree_.children_left.tolist(),
                           'children_right': e.tree_.children_right.tolist(),
                           'threshold': e.tree_.threshold.astype('float64').tolist(),
                           'feature': e.tree_.feature.astype('float64').tolist(),
                           'value': e.tree_.value[:, :, 0].astype('float64').tolist()} for e in clf.estimators_]
    # Explicit lists: a bare map() is not JSON-serialisable on Python 3.
    D['profiles'] = [[int(value) for value in profile] for profile in profiles]
    # The context manager guarantees the output is flushed and closed.
    with open(json_path, 'w') as json_file:
        json.dump(json_data, json_file)
Tuning: Merged tune branch. - Much cleaner and more concise source - Better exceptions handling - Checks local minima to see if retuning is needed. Resolved conflicts: bench/blas.cpp include/isaac/backend/templates/mproduct.h include/isaac/driver/buffer.h lib/array.cpp lib/backend/templates/mproduct.cpp lib/driver/buffer.cpp python/setup.py tune/pysrc/autotune.py tune/pysrc/dataset.py tune/pysrc/misc_tools.py 2015-06-28 17:53:16 -07:00			`import random, argparse, json, os`
			`from math import log, isinf`
			`from itertools import chain, product`
			`from numpy import argsort, argmax`
			`from operator import mul`
Code quality: renamed isaac shortcut from isc to sc 2015-08-12 19:38:53 -07:00			`import isaac as sc`
Tuner: Moved tuning logic into the python wrapper - draft of Android tuning app using kivy 2015-08-16 19:58:54 -07:00			`from external.sklearn.forest import RandomForestRegressor`
Tuning: Merged tune branch. - Much cleaner and more concise source - Better exceptions handling - Checks local minima to see if retuning is needed. Resolved conflicts: bench/blas.cpp include/isaac/backend/templates/mproduct.h include/isaac/driver/buffer.h lib/array.cpp lib/backend/templates/mproduct.cpp lib/driver/buffer.cpp python/setup.py tune/pysrc/autotune.py tune/pysrc/dataset.py tune/pysrc/misc_tools.py 2015-06-28 17:53:16 -07:00			`import optimize, tools, model`
Tuning: added ger default sizes 2015-08-04 16:03:14 -07:00			`from json import encoder`
Tuner: Moved tuning logic into the python wrapper - draft of Android tuning app using kivy 2015-08-16 19:58:54 -07:00			`import json`

Tuning: added ger default sizes 2015-08-04 16:03:14 -07:00			`encoder.FLOAT_REPR = lambda o: format(o, '.2f')`
			`encoder.separators = (',',':')`

Tuning: Merged tune branch. - Much cleaner and more concise source - Better exceptions handling - Checks local minima to see if retuning is needed. Resolved conflicts: bench/blas.cpp include/isaac/backend/templates/mproduct.h include/isaac/driver/buffer.h lib/array.cpp lib/backend/templates/mproduct.cpp lib/driver/buffer.cpp python/setup.py tune/pysrc/autotune.py tune/pysrc/dataset.py tune/pysrc/misc_tools.py 2015-06-28 17:53:16 -07:00			`def unique(L):`
			`seen = set()`
			`seen_add = seen.add`
			`return [ x for x in L if not (x in seen or seen_add(x))]`

			`def pow2range(a, b):`
			`return [2**x for x in range(a, b)]`


			`def tune(device, operation, json_path):`
Tuner: Moved tuning logic into the python wrapper - draft of Android tuning app using kivy 2015-08-16 19:58:54 -07:00			`#Context`
Code quality: renamed isaac shortcut from isc to sc 2015-08-12 19:38:53 -07:00			`context = sc.driver.context(device)`
Tuning: Merged tune branch. - Much cleaner and more concise source - Better exceptions handling - Checks local minima to see if retuning is needed. Resolved conflicts: bench/blas.cpp include/isaac/backend/templates/mproduct.h include/isaac/driver/buffer.h lib/array.cpp lib/backend/templates/mproduct.cpp lib/driver/buffer.cpp python/setup.py tune/pysrc/autotune.py tune/pysrc/dataset.py tune/pysrc/misc_tools.py 2015-06-28 17:53:16 -07:00
			`#List of size tuples to use`
Backend: GEMM - Improved bounds checking 2015-07-02 14:02:31 -04:00			`sizes = {}`
Code quality: renamed isaac shortcut from isc to sc 2015-08-12 19:38:53 -07:00			`sizes[sc.templates.axpy] = [(x,) for x in tools.expspace(1e3, 1e8, 4)]`
			`sizes[sc.templates.gemv_n] = product(pow2range(4,17), pow2range(4,17))`
			`sizes[sc.templates.gemv_t] = sizes[sc.templates.gemv_n]`
			`sizes[sc.templates.gemm_nn] = product(pow2range(6, 12), pow2range(6, 12), pow2range(6, 12))`
			`sizes[sc.templates.gemm_tn] = sizes[sc.templates.gemm_nn]`
			`sizes[sc.templates.gemm_nt] = sizes[sc.templates.gemm_nn]`
			`sizes[sc.templates.gemm_tt] = sizes[sc.templates.gemm_nn]`
Tuning: Added AlexNet sizes for GEMM 2015-07-16 14:26:21 -04:00
Tuning: added ger default sizes 2015-08-04 16:03:14 -07:00
Tune: added quick tuning sizes for GEMV 2015-08-10 10:19:27 -07:00			`#Quick tuning - AlexNet sizes + Intuition`
Code quality: renamed isaac shortcut from isc to sc 2015-08-12 19:38:53 -07:00			`sizes[sc.templates.ger] = [(1536,1536)]`
Tune: added quick tuning sizes for GEMV 2015-08-10 10:19:27 -07:00
Code quality: renamed isaac shortcut from isc to sc 2015-08-12 19:38:53 -07:00			`sizes[sc.templates.gemv_n] = [(1000,256),`
Tune: added quick tuning sizes for GEMV 2015-08-10 10:19:27 -07:00			`(4096,256)]`
Code quality: renamed isaac shortcut from isc to sc 2015-08-12 19:38:53 -07:00			`sizes[sc.templates.gemv_t] = [(169,256),`
Tune: added quick tuning sizes for GEMV 2015-08-10 10:19:27 -07:00			`(169,384),`
			`(729,256),`
			`(3025,96)]`

Code quality: renamed isaac shortcut from isc to sc 2015-08-12 19:38:53 -07:00			`sizes[sc.templates.gemm_nn] = [(3025,96,363),`
Tuning: Added AlexNet sizes for GEMM 2015-07-16 14:26:21 -04:00			`(729,128,1200),`
			`(169,384,2304),`
			`(169,192,1728),`
			`(169,128,1728)]`
Code quality: renamed isaac shortcut from isc to sc 2015-08-12 19:38:53 -07:00			`sizes[sc.templates.gemm_nt] = [(169,1728,128),`
Tuning: Added AlexNet sizes for GEMM 2015-07-16 14:26:21 -04:00			`(169,1728,192),`
			`(169,2304,384),`
			`(729,1200,128)]`
Code quality: renamed isaac shortcut from isc to sc 2015-08-12 19:38:53 -07:00			`sizes[sc.templates.gemm_tn] = [(1728,128,169),`
Tuning: Added AlexNet sizes for GEMM 2015-07-16 14:26:21 -04:00			`(1728,192,169),`
			`(2304,384,169),`
			`(1200,128,729),`
			`(363,96,3025)]`

Tune: added quick tuning sizes for GEMV 2015-08-10 10:19:27 -07:00			`#Remove duplicated`
Backend: GEMM - Improved bounds checking 2015-07-02 14:02:31 -04:00			`sizes = unique(list(sizes[operation]))`
Tuning: Merged tune branch. - Much cleaner and more concise source - Better exceptions handling - Checks local minima to see if retuning is needed. Resolved conflicts: bench/blas.cpp include/isaac/backend/templates/mproduct.h include/isaac/driver/buffer.h lib/array.cpp lib/backend/templates/mproduct.cpp lib/driver/buffer.cpp python/setup.py tune/pysrc/autotune.py tune/pysrc/dataset.py tune/pysrc/misc_tools.py 2015-06-28 17:53:16 -07:00			`sizes = [x for x in sizes if 1e-4 <= tools.memory_footprint(operation, x) <= 1e-1]`
Backend: GEMM - Improved bounds checking 2015-07-02 14:02:31 -04:00
Tuning: Merged tune branch. - Much cleaner and more concise source - Better exceptions handling - Checks local minima to see if retuning is needed. Resolved conflicts: bench/blas.cpp include/isaac/backend/templates/mproduct.h include/isaac/driver/buffer.h lib/array.cpp lib/backend/templates/mproduct.cpp lib/driver/buffer.cpp python/setup.py tune/pysrc/autotune.py tune/pysrc/dataset.py tune/pysrc/misc_tools.py 2015-06-28 17:53:16 -07:00			`#Training data`
			`performance = tools.metric_of(operation)`
			`profiles = []`
			`X = []`
			`Y = []`
			`for idx, x in enumerate(sizes):`
			`print x`
			`nparams = len(profiles)`
			`tree, operands = tools.tree_of(operation, x, context)`
			`#Check if the current best prediction is not a local optimum`
			`if idx==0:`
			`tune = True`
			`predicted = None`
			`else:`
			`if nparams==1:`
			`predicted = profiles[0]`
			`else:`
Tuner: Removed Sklearn + scipy dependency 2015-08-13 01:51:02 -07:00			`clf = RandomForestRegressor(min(10, idx+1), max_depth=min(10, idx+1)).fit(X, Y)`
			`#clf, nrmse = model.train(X, Y, profiles)`
Tuning: Merged tune branch. - Much cleaner and more concise source - Better exceptions handling - Checks local minima to see if retuning is needed. Resolved conflicts: bench/blas.cpp include/isaac/backend/templates/mproduct.h include/isaac/driver/buffer.h lib/array.cpp lib/backend/templates/mproduct.cpp lib/driver/buffer.cpp python/setup.py tune/pysrc/autotune.py tune/pysrc/dataset.py tune/pysrc/misc_tools.py 2015-06-28 17:53:16 -07:00			`predperf = clf.predict(x)[0]`
			`best = (-predperf).argsort()[:5]`
			`perf = [performance(x, tools.benchmark(operation, profiles[b], tree)) for b in best]`
			`predicted = profiles[best[argmax(perf)]]`
Tuning: Added AlexNet sizes for GEMM 2015-07-16 14:26:21 -04:00			`#tune = not optimize.is_local_optimum(predicted, operation, x, context)`
			`tune = True`
Tuning: Merged tune branch. - Much cleaner and more concise source - Better exceptions handling - Checks local minima to see if retuning is needed. Resolved conflicts: bench/blas.cpp include/isaac/backend/templates/mproduct.h include/isaac/driver/buffer.h lib/array.cpp lib/backend/templates/mproduct.cpp lib/driver/buffer.cpp python/setup.py tune/pysrc/autotune.py tune/pysrc/dataset.py tune/pysrc/misc_tools.py 2015-06-28 17:53:16 -07:00			`#Retune if necessary`
			`if tune:`
			`#new = optimize.exhaustive(operation, x, context)`
			`new = optimize.genetic(operation, x, context, niter=1000, naccept=1000, popsize=20, prior=predicted)[0]`
			`if new not in profiles:`
			`profiles.append(new)`
			`if idx > 0:`
			`for xx,yy in zip(X, Y):`
			`_tree, _operands = tools.tree_of(operation, xx, context)`
Tuning: Bugfixes and loosened local minima check 2015-07-12 23:19:00 -07:00			`try:`
			`time = tools.benchmark(operation, new, _tree)`
			`perf = performance(xx, time)`
Code quality: renamed isaac shortcut from isc to sc 2015-08-12 19:38:53 -07:00			`except (sc.OperationNotSupported, sc.LaunchOutOfResources, sc.MemObjectAllocationFailure):`
Tuning: Bugfixes and loosened local minima check 2015-07-12 23:19:00 -07:00			`perf = 0`
Tuning: Merged tune branch. - Much cleaner and more concise source - Better exceptions handling - Checks local minima to see if retuning is needed. Resolved conflicts: bench/blas.cpp include/isaac/backend/templates/mproduct.h include/isaac/driver/buffer.h lib/array.cpp lib/backend/templates/mproduct.cpp lib/driver/buffer.cpp python/setup.py tune/pysrc/autotune.py tune/pysrc/dataset.py tune/pysrc/misc_tools.py 2015-06-28 17:53:16 -07:00			`yy.append(0 if isinf(perf) else perf)`
			`#Update dataset`
			`y = []`
			`fastest = max(predperf) if nparams > 1 else None`
			`for ip, p in enumerate(profiles):`
Tuning: Bugfixes and loosened local minima check 2015-07-12 23:19:00 -07:00			`try:`
			`perf = 0 if fastest and ip < nparams and predperf[ip]/fastest < .1 else performance(x,tools.benchmark(operation, p, tree))`
Code quality: renamed isaac shortcut from isc to sc 2015-08-12 19:38:53 -07:00			`except (sc.OperationNotSupported, sc.LaunchOutOfResources, sc.MemObjectAllocationFailure):`
Tuning: Bugfixes and loosened local minima check 2015-07-12 23:19:00 -07:00			`perf = 0`
Tuning: Merged tune branch. - Much cleaner and more concise source - Better exceptions handling - Checks local minima to see if retuning is needed. Resolved conflicts: bench/blas.cpp include/isaac/backend/templates/mproduct.h include/isaac/driver/buffer.h lib/array.cpp lib/backend/templates/mproduct.cpp lib/driver/buffer.cpp python/setup.py tune/pysrc/autotune.py tune/pysrc/dataset.py tune/pysrc/misc_tools.py 2015-06-28 17:53:16 -07:00			`y.append(0 if isinf(perf) else perf)`
			`X.append(x)`
			`Y.append(y)`
Backend: GEMM - Improved bounds checking 2015-07-02 14:02:31 -04:00
Tuning: Merged tune branch. - Much cleaner and more concise source - Better exceptions handling - Checks local minima to see if retuning is needed. Resolved conflicts: bench/blas.cpp include/isaac/backend/templates/mproduct.h include/isaac/driver/buffer.h lib/array.cpp lib/backend/templates/mproduct.cpp lib/driver/buffer.cpp python/setup.py tune/pysrc/autotune.py tune/pysrc/dataset.py tune/pysrc/misc_tools.py 2015-06-28 17:53:16 -07:00
			`#Export to JSON`
Tuner: Moved tuning logic into the python wrapper - draft of Android tuning app using kivy 2015-08-16 19:58:54 -07:00			`json_path = tools.sanitize(device.name) + '.json' if not json_path else json_path`
Tuning: Merged tune branch. - Much cleaner and more concise source - Better exceptions handling - Checks local minima to see if retuning is needed. Resolved conflicts: bench/blas.cpp include/isaac/backend/templates/mproduct.h include/isaac/driver/buffer.h lib/array.cpp lib/backend/templates/mproduct.cpp lib/driver/buffer.cpp python/setup.py tune/pysrc/autotune.py tune/pysrc/dataset.py tune/pysrc/misc_tools.py 2015-06-28 17:53:16 -07:00			`if os.path.isfile(json_path):`
Backend: GEMM - Improved bounds checking 2015-07-02 14:02:31 -04:00			`json_data = json.load(open(json_path, 'r'))`
Tuning: Merged tune branch. - Much cleaner and more concise source - Better exceptions handling - Checks local minima to see if retuning is needed. Resolved conflicts: bench/blas.cpp include/isaac/backend/templates/mproduct.h include/isaac/driver/buffer.h lib/array.cpp lib/backend/templates/mproduct.cpp lib/driver/buffer.cpp python/setup.py tune/pysrc/autotune.py tune/pysrc/dataset.py tune/pysrc/misc_tools.py 2015-06-28 17:53:16 -07:00			`else:`
			`json_data = {}`
			`json_data["version"] = "1.0"`
			`operation_name = operation.__name__`
			`if operation_name not in json_data:`
			`json_data[operation_name] = {}`
			`json_data[operation_name]['float32'] = {}`
			`D = json_data[operation_name]['float32']`
			`if len(profiles) > 1:`
Tuner: Removed Sklearn + scipy dependency 2015-08-13 01:51:02 -07:00			`clf, nrmse = model.train(X, Y, profiles)`
Tuning: Merged tune branch. - Much cleaner and more concise source - Better exceptions handling - Checks local minima to see if retuning is needed. Resolved conflicts: bench/blas.cpp include/isaac/backend/templates/mproduct.h include/isaac/driver/buffer.h lib/array.cpp lib/backend/templates/mproduct.cpp lib/driver/buffer.cpp python/setup.py tune/pysrc/autotune.py tune/pysrc/dataset.py tune/pysrc/misc_tools.py 2015-06-28 17:53:16 -07:00			`D['predictor'] = [{'children_left': e.tree_.children_left.tolist(),`
			`'children_right': e.tree_.children_right.tolist(),`
			`'threshold': e.tree_.threshold.astype('float64').tolist(),`
			`'feature': e.tree_.feature.astype('float64').tolist(),`
			`'value': e.tree_.value[:,:,0].astype('float64').tolist()} for e in clf.estimators_]`
			`D['profiles'] = [map(int, x) for x in profiles]`
			`json.dump(json_data, open(json_path,'w'))`