from sklearn import tree
from sklearn import ensemble
# NOTE: `max` and `min` imported below shadow the Python builtins module-wide.
from numpy import array, bincount, mean, std, max, argmax, min, argmin, median
import numpy as np


def gmean(a, axis=0, dtype=None):
    """Return the geometric mean of `a` along `axis`.

    Computed as ``exp(mean(log(a)))``.

    Args:
        a: Array-like input. Non-ndarray inputs are converted with
            ``np.array``; ndarray inputs (including masked arrays) are used
            as-is unless `dtype` is given.
        axis: Axis along which the mean of the logarithms is taken.
        dtype: Optional dtype the input is converted to before taking logs.

    Returns:
        The geometric mean; a scalar or an array depending on the input
        dimensionality and `axis`.
    """
    if not isinstance(a, np.ndarray):
        # Not an ndarray: attempt to convert it.
        log_a = np.log(np.array(a, dtype=dtype))
    elif dtype:
        # An explicit dtype was requested: convert while keeping masked
        # arrays masked so that masked entries stay excluded from the mean.
        if isinstance(a, np.ma.MaskedArray):
            log_a = np.log(np.ma.asarray(a, dtype=dtype))
        else:
            log_a = np.log(np.asarray(a, dtype=dtype))
    else:
        log_a = np.log(a)
    return np.exp(log_a.mean(axis=axis))


def _speedups(costs, ref, chosen):
    """Return costs[i, ref] / costs[i, chosen[i]] for every row i."""
    rows = np.arange(costs.shape[0])
    return costs[:, ref] / costs[rows, chosen]


def train_model(X, Y, profiles, metric):
    """Fit a random-forest regressor on (X, Y) and print held-out speedups.

    `X` is standardized with the mean and standard deviation taken over ALL
    of its elements (not per column) and `Y` is divided by its global
    maximum. The first ``int(0.8 * n_rows + 1)`` rows are used for training
    and the remaining rows for evaluation.

    For each held-out row the column with the smallest predicted value is
    selected, and the "speedup" is the ratio between the value of the
    reference column (the column that is most often the row-wise minimum of
    `Y`) and the value of the selected column. The same statistic is printed
    for the true row-wise minimum ("Optimal speedup").
    NOTE(review): `Y` presumably holds per-profile costs/runtimes where
    lower is better -- confirm against the caller.

    Args:
        X: 2-D numpy array of features, one row per sample.
        Y: 2-D numpy array of targets, one row per sample and one column
            per candidate; must have the same number of rows as `X`.
        profiles: Unused; kept for interface compatibility.
        metric: Unused; kept for interface compatibility.

    Returns:
        None. Results are only printed; the fitted model is not returned.

    Raises:
        ValueError: If the split leaves no held-out rows (five rows or
            fewer), since no speedup could be evaluated.
    """
    print("Building the model...")

    n_samples = X.shape[0]
    cut = int(0.800 * n_samples + 1)
    if cut >= n_samples:
        # Without this check the failure only surfaces later, inside the
        # model/prediction code, with a far less helpful message.
        raise ValueError(
            "train_model needs at least 6 samples to keep a held-out set, "
            "got %d" % n_samples
        )

    # Global (scalar) standardization of the features.
    X = (X - np.mean(X)) / np.std(X)
    # Scale targets by the global maximum.
    Y = Y / np.max(Y)

    # Most common best profile: the column that is most often the row minimum.
    ref = np.argmax(np.bincount(np.argmin(Y, axis=1)))

    X_train, Y_train = X[:cut, :], Y[:cut, :]
    X_test, Y_test = X[cut:, :], Y[cut:, :]

    # Train the model (10 trees, depth limited to 10).
    clf = ensemble.RandomForestRegressor(10, max_depth=10).fit(X_train, Y_train)

    predicted_best = np.argmin(clf.predict(X_test), axis=1)
    s = _speedups(Y_test, ref, predicted_best)

    true_best = np.argmin(Y_test, axis=1)
    ss = _speedups(Y_test, ref, true_best)

    print("Testing speedup : mean = %.3f, median = %.3f, min = %.3f, max %.3f"%(gmean(s), np.median(s), np.min(s), np.max(s)))
    print("Optimal speedup : mean = %.3f, median = %.3f, min = %.3f, max %.3f"%(gmean(ss), np.median(ss), np.min(ss), np.max(ss)))