# triton/autotune/python/model.py

from sklearn import *
from sklearn import tree
from sklearn import ensemble

import numpy as np
import scipy as sp


# def random_forest(Xtr, Ytr):
#     clf = ensemble.RandomForestRegressor(10, max_depth=7).fit(Xtr,Ytr)
#
#     def predict_tree(tree, x):
#         tree_ = tree.tree_
#         children_left = tree_.children_left
#         children_right = tree_.children_right
#         threshold = tree_.threshold
#         feature = tree_.feature
#         value = tree_.value
#         idx = 0
#         while children_left[idx]!=-1:
#             if x[0, feature[idx]] <= threshold[idx]:
#                 idx = children_left[idx]
#             else:
#                 idx = children_right[idx]
#         return value[[idx],:,:][:,:,0]
#
#     s = 0
#     for e in clf.estimators_:
#         tree_ = e.tree_
#         children_left = tree_.children_left
#         children_right = tree_.children_right
#         threshold = tree_.threshold
#         feature = tree_.feature
#         value = tree_.value
#         s = s + value.size + feature.size + threshold.size + children_right.size + children_left.size
#     print s*4*1e-3
#     return clf, clf.predict
#
# def train_nn(layer_sizes, XTr, YTr, XTe, YTe):
#     optimizer = HF(open(os.devnull, 'w'), 15)
#     optimizer.doCGBacktracking = True
#     net = FeedforwardNeuralNet(layer_sizes, [Act.Tanh() for i in range(len(layer_sizes)-2)], Act.Linear(), 1e-5)
#
#     nbatch=10
#     bsize = XTr.shape[0]/nbatch
#     data = ((XTr[(i%nbatch)*bsize:(i%nbatch+1)*bsize,:], YTr[(i%nbatch)*bsize:(i%nbatch+1)*bsize,:]) for i in range(nbatch))
#     data = HFDataSource(data, bsize, gradBatchSize = nbatch*bsize, curvatureBatchSize = bsize, lineSearchBatchSize =nbatch*bsize, gradBatchIsTrainingSet=True)
#     iters = optimizer.optimize(HFModel(net), data, 300, otherPrecondDampingTerm=net.L2Cost)
#     bestte = collections.deque([float("inf")]*5, maxlen=5)
#     for i,w in enumerate(iters):
#         Diffte = YTe - net.predictions(XTe).as_numpy_array()
#         Difftr = YTr - net.predictions(XTr).as_numpy_array()
#         Ete = np.sum(Diffte**2)
#         Etr = np.sum(Difftr**2)
#         bestte.append(min(min(bestte),Ete))
#         if min(bestte)==max(bestte):
#             print 'Final test error: ', Ete
#             return net, net.predictions
#         print 'Iteration %d | Test error = %.2f | Train error = %.2f'%(i, Ete, Etr)
#     return net, net.predictions

def train_model(X, Y, profiles, metric):
    """Fit a random-forest regressor on (X, Y) and print hold-out speedups.

    The data is split in order (no shuffling): roughly the first 80% of the
    rows are used for training and the remaining rows for evaluation.  For
    every held-out row, the column with the smallest predicted value is
    selected, and the "speedup" is the ratio between the measured value of a
    fixed reference column and the measured value of the selected column.
    The same statistic is printed for the column that is actually the
    smallest in each row ("optimal").

    Args:
        X: 2-D array of input features, one row per sample.
        Y: 2-D array of measured values, one row per sample and one column
            per candidate (presumably one column per profile -- confirm
            against the caller).
        profiles: Not referenced by the code in this block.
        metric: Not referenced by the code in this block.

    Raises:
        ValueError: If there are too few samples to leave at least one row
            for the hold-out evaluation.
    """
    # Imported explicitly: ``import scipy as sp`` on its own does not
    # guarantee that the ``scipy.stats`` submodule has been loaded.
    from scipy.stats import gmean

    print("Building the model...")

    # Standardize with a single scalar mean/std computed over every entry of
    # X (np.mean/np.std without an axis), not per feature column.
    Xmean = np.mean(X)
    Xstd = np.std(X)
    X = (X - Xmean)/Xstd

    # Rescale the targets by their global maximum.
    Ymax = np.max(Y)
    Y = Y/Ymax

    # Reference column: the one that is most often the per-row minimum.
    ref = np.argmax(np.bincount(np.argmin(Y, axis=1)))  # most common profile
    cut = int(0.800*X.shape[0]+1)
    if cut >= X.shape[0]:
        # Without this check the failure only shows up after training, as an
        # error about an empty array inside the regressor's predict().
        raise ValueError(
            "train_model needs more samples: %d row(s) leave nothing for the "
            "hold-out evaluation" % X.shape[0])

    # Train the model
    clf = ensemble.RandomForestRegressor(n_estimators=10, max_depth=10)
    clf = clf.fit(X[:cut, :], Y[:cut, :])

    Y_test = Y[cut:, :]
    rows = np.arange(Y_test.shape[0])
    baseline = Y_test[:, ref]

    # Speedup obtained with the column the model predicts to be smallest.
    t = np.argmin(clf.predict(X[cut:, :]), axis=1)
    s = baseline / Y_test[rows, t]

    # Speedup obtained with the column that really is the smallest.
    tt = np.argmin(Y_test, axis=1)
    ss = baseline / Y_test[rows, tt]

    print("Testing speedup : mean = %.3f, median = %.3f, min = %.3f,  max %.3f"%(gmean(s), np.median(s), np.min(s), np.max(s)))
    print("Optimal speedup : mean = %.3f, median = %.3f, min = %.3f,  max %.3f"%(gmean(ss), np.median(ss), np.min(ss), np.max(ss)))