Cleaned up model building; added some informative commented-out code

This commit is contained in:
Philippe Tillet
2014-10-13 03:38:19 +02:00
parent b8d339e54d
commit 6313f060cb
5 changed files with 93 additions and 61 deletions

View File

@@ -1,44 +1,85 @@
from sklearn import *;
from sklearn import ensemble;
from sklearn import *
from sklearn import tree
from sklearn import ensemble
import numpy as np
import scipy as sp
# def random_forest(Xtr, Ytr):
# clf = ensemble.RandomForestRegressor(10, max_depth=7).fit(Xtr,Ytr)
#
# def predict_tree(tree, x):
# tree_ = tree.tree_
# children_left = tree_.children_left
# children_right = tree_.children_right
# threshold = tree_.threshold
# feature = tree_.feature
# value = tree_.value
# idx = 0
# while children_left[idx]!=-1:
# if x[0, feature[idx]] <= threshold[idx]:
# idx = children_left[idx]
# else:
# idx = children_right[idx]
# return value[[idx],:,:][:,:,0]
#
# s = 0
# for e in clf.estimators_:
# tree_ = e.tree_
# children_left = tree_.children_left
# children_right = tree_.children_right
# threshold = tree_.threshold
# feature = tree_.feature
# value = tree_.value
# s = s + value.size + feature.size + threshold.size + children_right.size + children_left.size
# print s*4*1e-3
# return clf, clf.predict
#
# def train_nn(layer_sizes, XTr, YTr, XTe, YTe):
# optimizer = HF(open(os.devnull, 'w'), 15)
# optimizer.doCGBacktracking = True
# net = FeedforwardNeuralNet(layer_sizes, [Act.Tanh() for i in range(len(layer_sizes)-2)], Act.Linear(), 1e-5)
#
# nbatch=10
# bsize = XTr.shape[0]/nbatch
# data = ((XTr[(i%nbatch)*bsize:(i%nbatch+1)*bsize,:], YTr[(i%nbatch)*bsize:(i%nbatch+1)*bsize,:]) for i in range(nbatch))
# data = HFDataSource(data, bsize, gradBatchSize = nbatch*bsize, curvatureBatchSize = bsize, lineSearchBatchSize =nbatch*bsize, gradBatchIsTrainingSet=True)
# iters = optimizer.optimize(HFModel(net), data, 300, otherPrecondDampingTerm=net.L2Cost)
# bestte = collections.deque([float("inf")]*5, maxlen=5)
# for i,w in enumerate(iters):
# Diffte = YTe - net.predictions(XTe).as_numpy_array()
# Difftr = YTr - net.predictions(XTr).as_numpy_array()
# Ete = np.sum(Diffte**2)
# Etr = np.sum(Difftr**2)
# bestte.append(min(min(bestte),Ete))
# if min(bestte)==max(bestte):
# print 'Final test error: ', Ete
# return net, net.predictions
# print 'Iteration %d | Test error = %.2f | Train error = %.2f'%(i, Ete, Etr)
# return net, net.predictions
def train_model(X, Y, profiles, metric):
# Normalizes X and Y, fits a random-forest regressor on the leading ~80% of
# the rows, and prints speedup statistics computed on the remaining rows.
# `profiles` is never referenced in the visible body; `metric` is only used as
# a label inside the printed messages. No return statement is visible here.
#
# NOTE(review): this span comes from a flattened commit diff (see the
# "@@ -1,44 +1,85 @@" hunk header): indentation is lost and lines that look
# like the pre-commit and post-commit variants are interleaved, so Xmean,
# Xstd, ref and clf are each assigned twice in a row. Which variant is the
# committed one cannot be established from this view -- confirm against the
# real file before relying on any single line.
#Preprocessing
# Variant with axis=0: one mean / standard deviation per column of X.
Xmean = np.mean(X, axis=0)
Xstd = np.std(X, axis=0)
print("Building the model...")
# Variant without axis: a single scalar mean / standard deviation over all
# entries of X. Being later in the text, these overwrite the values above.
Xmean = np.mean(X)
Xstd = np.std(X)
# Standardize the features. There is no guard against Xstd being zero.
X = (X - Xmean)/Xstd
# Full slice over both axes; nothing is selected or dropped here.
Y = Y[:, :]
# Scale the targets by their global maximum; Ymax is reused below to convert
# back to the original units.
Ymax = np.max(Y)
Y = Y/Ymax
# Reference column = the column that most often holds the row-wise MAXIMUM.
ref = np.argmax(np.bincount(np.argmax(Y, axis=1))) #most common profile
#Cross-validation data-sets
# Reference column = the column that most often holds the row-wise MINIMUM.
# NOTE(review): this disagrees with the argmax variant above about whether a
# larger or a smaller Y is "better" -- confirm which convention Y follows.
ref = np.argmax(np.bincount(np.argmin(Y, axis=1))) #most common profile
# Number of leading rows used for training: 80% of the rows, plus one,
# truncated to an int.
cut = int(0.800*X.shape[0]+1)
# Rows [0, cut) form the training split; rows [cut, end) form the test split.
XTr = X[0:cut, :]
YTr = Y[0:cut, :]
XTe = X[cut:,:]
YTe = Y[cut:,:]
#Train the model
print("Training the model...")
# Variant 1: forest built with a first positional argument of 40 (the number
# of trees in scikit-learn's signature), fitted on the training split.
clf = ensemble.RandomForestRegressor(40).fit(XTr,YTr)
# Variant 2: first positional argument 10 and max_depth=10, fitted on the same
# row range; this rebinds clf.
clf = ensemble.RandomForestRegressor(10, max_depth=10).fit(X[:cut,:], Y[:cut,:])
#Evaluate the model
# One slot per test row; np.empty leaves the contents uninitialized until the
# loop below fills them.
GFlops = np.empty(XTe.shape[0])
speedups = np.empty(XTe.shape[0])
optspeedups = np.empty(XTe.shape[0])
# Per test row: pick the column with the LARGEST prediction, then compare the
# measured value of that column (and of the best column) to the reference.
for i,x in enumerate(XTe):
# NOTE(review): x is a single row of XTe; recent scikit-learn releases expect
# a 2-D array in predict() -- TODO confirm against the version in use.
predictions = clf.predict(x)
label = np.argmax(predictions)
speedups[i] = YTe[i,label]/YTe[i,ref]
optspeedups[i] = np.max(YTe[i,:])/YTe[i,ref]
# Undo the Y/Ymax scaling to report the reference value in original units.
GFlops[i] = YTe[i,ref]*Ymax
np.set_printoptions(precision=2)
print("-----------------")
# Geometric means of the per-row speedups.
# NOTE(review): only `import scipy as sp` is visible; sp.stats being reachable
# without an explicit `import scipy.stats` depends on the SciPy version or on
# another import having loaded it -- verify.
print("Average testing speedup : %f (Optimal : %f)"%(sp.stats.gmean(speedups), sp.stats.gmean(optspeedups)))
print("Average %s: %f (Default %f, Optimal %f)"%(metric, np.mean(np.multiply(GFlops,speedups)), np.mean(GFlops), np.mean(np.multiply(GFlops,optspeedups))))
# The %i conversion truncates the reported reference value to an integer.
print("Minimum speedup is %f wrt %i %s"%(np.min(speedups), GFlops[np.argmin(speedups)], metric))
print("Maximum speedup is %f wrt %i %s"%(np.max(speedups), GFlops[np.argmax(speedups)], metric))
print("--------")
# Second evaluation, vectorized: t holds, per test row, the column with the
# SMALLEST predicted value.
t = np.argmin(clf.predict(X[cut:,:]), axis = 1)
# s = measured reference value divided by the measured value of the predicted
# column, one ratio per test row.
s = np.array([y[ref]/y[k] for y,k in zip(Y[cut:,:], t)])
# s = np.maximum(s, 1.0)
# tt / ss: the same ratio, using the column with the smallest MEASURED value
# in each test row instead of the predicted one.
tt = np.argmin(Y[cut:,:], axis = 1)
ss = np.array([y[ref]/y[k] for y,k in zip(Y[cut:,:], tt)])
# Summary statistics; the value labelled "mean" is the geometric mean.
print("Testing speedup : mean = %.3f, median = %.3f, min = %.3f, max %.3f"%(sp.stats.gmean(s), np.median(s), np.min(s), np.max(s)))
print("Optimal speedup : mean = %.3f, median = %.3f, min = %.3f, max %.3f"%(sp.stats.gmean(ss), np.median(ss), np.min(ss), np.max(ss)))