Enhancements of the auto-tuner

Philippe Tillet
2014-10-29 17:01:57 +01:00
parent 199badc689
commit 9803bc8c92
4 changed files with 76 additions and 69 deletions

View File

@@ -7,17 +7,16 @@ import pyviennacl as vcl
 import pyatidlas as atd
 import numpy as np
-from configobj import ConfigObj
 from numpy import random
 from model import train_model

 TYPES = { 'vector-axpy': {'template':atd.VectorAxpyTemplate,
-                          'perf-index':lambda x: 3*x[0]*x[1][0]/x[2]*1e-9,
+                          'perf-index':lambda x: 2*x[0]*x[1][0]/x[2]*1e-9,
                           'perf-measure':'GB/s'},
           'matrix-axpy': {'template':atd.MatrixAxpyTemplate,
-                          'perf-index':lambda x: 3*x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
+                          'perf-index':lambda x: 2*x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
                           'perf-measure':'GB/s'},
           'reduction': {'template':atd.ReductionTemplate,
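The perf-index coefficient drops from 3 to 2 because the benchmark expressions are simplified later in this commit: vcl.Assign(z, x) reads one buffer and writes one, so two arrays of sizes[0] elements move per run instead of three. A quick sanity check of the new lambda, with hypothetical numbers (size and runtime are made up):

    itemsize = 4            # float32
    n = 10**7               # sizes[0]
    t = 2e-3                # hypothetical runtime in seconds
    # mirrors 'perf-index': lambda x: 2*x[0]*x[1][0]/x[2]*1e-9
    bandwidth = 2 * itemsize * n / t * 1e-9
    print(bandwidth)        # 40.0 GB/s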
@@ -34,8 +33,11 @@ TYPES = { 'vector-axpy': {'template':atd.VectorAxpyTemplate,
 def do_tuning(args, devices):
-    json_out = {}
     device = devices[args.device]
+    dname = misc_tools.sanitize_string(device.name)
+    json_out = {}
+    json_out["version"] = "1.0"

     def map_to_list(T, x):
         return list(map(T, x if isinstance(x, list) else [x]))
@@ -44,11 +46,15 @@ def do_tuning(args, devices):
     default_tuning_sizes = {'vector-axpy': [args.blas1_size], 'reduction': [args.blas1_size],
                             'matrix-axpy' : args.blas2_size, 'row-wise-reduction' : args.blas2_size,
                             'matrix-product': args.blas3_size}
-    for operation in ['vector-axpy', 'matrix-axpy', 'reduction', 'row-wise-reduction', 'matrix-product']:
+    for operation in ['matrix-product']:
         #Iterate through the datatypes
         for datatype in [vcl.float32, vcl.float64]:
+            if operation=='matrix-product' and datatype==vcl.float64 and args.no_dgemm:
+                continue
             ctx = cl.Context([device])
             ctx = vcl.backend.Context(ctx)
@@ -60,17 +66,17 @@ def do_tuning(args, devices):
             #Helper for execution
             def execute(device, node, other_params, sizes, fname = os.devnull, parameters = None):
                 with vcl.Statement(node) as statement:
-                    if parameters:
+                    if parameters is not None:
                         TemplateType = TYPES[operation]['template']
                         return misc_tools.benchmark(TemplateType(TemplateType.Parameters(*parameters),*other_params), statement, device)
-                    print('-----')
-                    print(' '.join(map(str, ("Now tuning:", datatype.__name__, '-', operation, '-'.join(other_params), '[' + device.name, '(' + device.platform.name + ')] for sizes', sizes))))
                     with open(fname, "w+") as archive:
                         return optimize.genetic(statement, device, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params),
                                                 lambda t: TYPES[operation]['perf-index']([datatype().itemsize, sizes, t]), TYPES[operation]['perf-measure'], archive)
             #Helper for tuning
-            def tune(execution_handler, n_datapoints, sampler, additional_parameters):
+            def tune(execution_handler, profiles_generator, dataset_generator, additional_parameters):
+                print('-----')
+                print(' '.join(map(str, ("Now tuning:", datatype.__name__, '-', operation, '-'.join(additional_parameters), '[' + device.name, '(' + device.platform.name + ')]'))))
                 #Update JSON
                 full_operation = operation + ''.join(additional_parameters)
                 if full_operation not in json_out:
@@ -79,13 +85,14 @@ def do_tuning(args, devices):
                 D = json_out[full_operation][datatype.__name__]
                 if args.method == 'simple':
-                    print default_tuning_sizes[operation]
                     profiles = [execution_handler(map(int,default_tuning_sizes[operation]))]
                 else:
                     def compute_perf(x, t):
                         return TYPES[operation]['perf-index']([datatype().itemsize, x, t])
-                    profiles = dataset.sample_profiles(execution_handler, args.sample_size, sampler)
+                    profiles = dataset.sample_profiles(execution_handler, profiles_generator)
                     if args.build_model:
-                        X, Y = dataset.sample_dataset(os.path.join(full_operation,datatype.__name__), profiles, execution_handler, n_datapoints, sampler)
+                        X, Y, profiles = dataset.sample_dataset(os.path.join(full_operation,datatype.__name__), profiles, execution_handler, dataset_generator)
                         clf = train_model(X, Y, profiles, TYPES[operation]['perf-measure'])
                         D['predictor'] = [{'children_left': e.tree_.children_left.tolist(),
                                            'children_right': e.tree_.children_right.tolist(),
@@ -94,17 +101,27 @@ def do_tuning(args, devices):
                                            'value': e.tree_.value[:,:,0].astype('float32').tolist()} for e in clf.estimators_]
                     if args.viennacl_src_path:
                         misc_tools.update_viennacl_headers(args.viennacl_src_path,device,datatype,operation,additional_parameters,profiles[0])
-                D['profiles'] = [ prof.astype('int').tolist() for prof in profiles]
+                D['profiles'] = [map(int, x) for x in profiles]
+
+            def log_uniform_sample(a,b):
+                return np.exp(np.random.uniform(low=np.log(a), high=np.log(b), size=1)).astype(int)
+
+            def log_space_gen_product(a,b,N,dim):
+                N = int(N**(1.0/dim))
+                def log_space_gen(a,b):
+                    for i in range(N):
+                        v = int(np.exp(np.log(a) + (np.log(b) - np.log(a))*(i+1)/N))
+                        yield (v//64 + 1)*64
+                return tuple(itertools.product(*[log_space_gen(a,b) for i in range(dim)]))
+
             #Vector AXPY
             if operation=='vector-axpy':
                 def execution_handler(sizes, fname=os.devnull, parameters=None):
                     x = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
-                    y = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
                     z = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
-                    return execute(device, vcl.Assign(z, vcl.ElementProd(vcl.exp(x + y),vcl.cos(x + y))), (), sizes, fname, parameters)
-                tune(execution_handler, 1000, lambda : 64*np.random.randint(low=10, high=100000, size=1), ())
+                    return execute(device, vcl.Assign(z, x), (), sizes, fname, parameters)
+                tune(execution_handler, log_space_gen_product(1e3, 1e7, args.sample_size, 1), log_space_gen_product(1e3, 1e7, 1000, 1), ())
             #Reduction
             if operation=='reduction':
                 def execution_handler(sizes, fname=os.devnull, parameters=None):
@@ -112,26 +129,25 @@ def do_tuning(args, devices):
                     y = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
                     s = vcl.Scalar(0, context=ctx, dtype=datatype)
                     return execute(device, vcl.Assign(s, vcl.Dot(x,y)), (), sizes, fname, parameters)
-                tune(execution_handler, 1000, lambda : 64*np.random.randint(low=10, high=100000, size=1), ())
+                tune(execution_handler, log_space_gen_product(1e3, 1e7, args.sample_size, 1), log_space_gen_product(1e3, 1e7, 1000, 1), ())
             #Matrix AXPY
             if operation=='matrix-axpy':
                 def execution_handler(sizes, fname=os.devnull, parameters=None):
                     A = vcl.Matrix(sizes, context=ctx, dtype=datatype)
-                    B = vcl.Matrix(sizes, context=ctx, dtype=datatype)
                     C = vcl.Matrix(sizes, context=ctx, dtype=datatype)
-                    return execute(device, vcl.Assign(C,A+B), (), sizes, fname, parameters)
-                tune(execution_handler, 1000, lambda : 64*np.random.randint(low=5, high=100, size=2), ())
+                    return execute(device, vcl.Assign(C,A), (), sizes, fname, parameters)
+                tune(execution_handler, log_space_gen_product(100, 4000, args.sample_size, 2), log_space_gen_product(100, 4000, 1000, 2), ())
             #Row-wise reduction
             if operation=='row-wise-reduction':
                 layouts = ['N', 'T']
                 for A_trans in layouts:
                     def execution_handler(sizes, fname=os.devnull, parameters=None):
                         A = vcl.Matrix(sizes if A_trans=='N' else sizes[::-1], context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
-                        x = vcl.Vector(sizes[1] if A_trans=='N' else sizes[0], context=ctx, dtype=datatype)
-                        y = vcl.Vector(sizes[0] if A_trans=='N' else sizes[1], context=ctx, dtype=datatype)
+                        x = vcl.Vector(sizes[1], context=ctx, dtype=datatype)
+                        y = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
                         LHS = A if A_trans=='N' else A.T
                         return execute(device, vcl.Assign(y, LHS*x), (), sizes, fname, parameters)
-                    tune(execution_handler, 1000, lambda : 64*np.random.randint(low=5, high=100, size=2), (A_trans,))
+                    tune(execution_handler, log_space_gen_product(100, 4000, args.sample_size, 2), log_space_gen_product(100, 4000, 1000, 2), (A_trans,))
             #Matrix Product
             if operation=='matrix-product':
                 layouts = ['NN', 'NT', 'TN', 'TT']
@@ -147,11 +163,12 @@ def do_tuning(args, devices):
                     beta = vcl.HostScalar(1.0, context=ctx, dtype = datatype)
                     C = vcl.Matrix((sizes[0], sizes[2]), context=ctx, dtype = datatype, layout=vcl.COL_MAJOR)
                     return execute(device, vcl.Assign(C,LHS*RHS*alpha + C*beta),(A_trans, B_trans), sizes, fname, parameters)
-                tune(execution_handler, 1000, lambda : 64*np.random.randint(low=1, high=40, size=3),(layout[0], layout[1]))
+                tune(execution_handler, log_space_gen_product(100, 4000, args.sample_size, 3), log_space_gen_product(100, 4000, 1000, 3),(layout[0], layout[1]))
+            json.dump(json_out, open(dname + '.json','w'))
-    dname = misc_tools.sanitize_string(device.name)
-    json_out["version"] = "1.0"
-    json.dump(json_out, open(dname + '.json','w'))
@@ -160,7 +177,8 @@ if __name__ == "__main__":
     subparsers = parser.add_subparsers(dest='action')
     print_devices_parser = subparsers.add_parser('list-devices', help='list the devices available')
     tune_parser = subparsers.add_parser('tune', help='tune using a specific configuration file')
-    tune_parser.add_argument("--device", default=0, required=False, type=str)
+    tune_parser.add_argument("--device", default=0, type=str)
+    tune_parser.add_argument("--no-dgemm", default=True, type=bool)
     tune_parser.add_argument("--viennacl-src-path", default='', type=str)
     tune_subparsers = tune_parser.add_subparsers(dest='method')
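For readers who want to preview the grid the new generator produces, here is a lightly reformatted standalone copy of log_space_gen_product with illustrative output (the function logic matches the diff above; the sample numbers are computed offline and are not part of the commit):

    import itertools
    import numpy as np

    def log_space_gen_product(a, b, N, dim):
        n = int(N ** (1.0 / dim))              # ~N**(1/dim) points per dimension
        def log_space_gen(a, b):
            for i in range(n):
                # log-spaced point in (a, b], rounded up to the next multiple of 64
                v = int(np.exp(np.log(a) + (np.log(b) - np.log(a)) * (i + 1) / n))
                yield (v // 64 + 1) * 64
        return tuple(itertools.product(*[log_space_gen(a, b) for _ in range(dim)]))

    sizes = log_space_gen_product(100, 4000, 1000, 2)   # the matrix-axpy grid
    print(len(sizes))     # 961: int(1000**0.5) = 31 points per dimension
    print(sizes[0])       # (128, 128): 112 rounded up to a multiple of 64

So each tune() call now walks a deterministic, 64-aligned, log-spaced grid (a smaller one for profile sampling via args.sample_size, 1000 points for the dataset) instead of drawing random multiples of 64.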

View File

@@ -4,56 +4,47 @@ import re
 import random
 import numpy as np

-def resample(X, sampler):
-    Xtuples = [tuple(x) for x in X]
-    r = random.random()
-    while(True):
-        x = sampler()
-        if tuple(x) not in Xtuples:
-            break
-    return x.astype(int)
-
-def sample_profiles(execution_handler, nTuning, sampler):
+def sample_profiles(execution_handler, generator):
     print "Sampling profiles..."
-    nDim = sampler().size
-    X = np.empty((nTuning, nDim))
-    t = np.empty(nTuning)
+    t = np.empty(0)
     profiles = []
-    for i in range(nTuning):
-        x = resample(X, sampler)
+    for i, x in enumerate(generator):
+        print x
+        if i==0:
+            X = np.empty((0,len(x)))
         y = execution_handler(x)
         if y not in profiles:
             profiles.append(y)
         idx = profiles.index(y)
-        X[i,:] = x
-        t[i] = idx
+        X = np.vstack((X, x))
+        t = np.append(t, idx)
     idx = int(t[np.argmax(np.linalg.norm(X, axis=1))])
-    profiles = np.array([profiles[idx]] + [x for i,x in enumerate(profiles) if i!=idx])
+    profiles = [profiles[idx]] + [x for i,x in enumerate(profiles) if i!=idx]
     return profiles

-def sample_dataset(prefix_name, profiles, execution_handler, nDataPoints, sampler):
+def sample_dataset(prefix_name, profiles, execution_handler, generator):
+    P = len(profiles)
     print "Generating the dataset..."
-    Y = np.empty((nDataPoints, len(profiles)))
-    X = np.empty((nDataPoints, len(profiles[0])))
-    t = []
-    for i in range(nDataPoints):
-        x = resample(X, sampler)
+    Y = np.empty((0, P))
+    for i,x in enumerate(generator):
+        if i==0:
+            X = np.empty((0,len(x)))
+        new_y = np.zeros(P)
         for j,y in enumerate(profiles):
             T = execution_handler(x, os.devnull, y)
-            Y[i,j] = T
-        idx = np.argmax(Y[i,:])
-        X[i,:] = x
-        t = np.argmax(Y[:i+1,], axis=1)
+            new_y[j] = T
+        X = np.vstack((X, x))
+        Y = np.vstack((Y, new_y))
         if i%10==0:
             sys.stdout.write('%d data points generated\r'%i)
             sys.stdout.flush()
-    idx = np.argsort(Y[np.argmax(X),:])
+    idx = np.argsort(Y[np.argmax(np.linalg.norm(X, axis=1)),:])
     Y = Y[:, idx]
-    profiles = profiles[idx]
+    profiles = [profiles[i] for i in idx]
     dir = os.path.join("data", prefix_name)
     if not os.path.exists(dir):
@@ -61,8 +52,5 @@ def sample_dataset(prefix_name, profiles, execution_handler, nDataPoints, sampler):
     np.savetxt(os.path.join(dir,"X.csv"), X)
     np.savetxt(os.path.join(dir,"Y.csv"), Y)
     np.savetxt(os.path.join(dir,"profiles.csv"), profiles)
-    X = np.loadtxt(os.path.join(dir, "X.csv"),ndmin=2)
-    Y = np.loadtxt(os.path.join(dir, "Y.csv"),ndmin=2)
-    profiles = np.loadtxt(os.path.join(dir, "profiles.csv"))
-    return X, Y
+    return X, Y, profiles
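The reworked sample_profiles now consumes any iterable of size tuples, deduplicates the winning profiles, and promotes the profile that won at the largest sizes (by norm of the size vector) to the front. A toy run against a stubbed execution_handler (the stub and grid below are fabricated for illustration; the real handler launches the genetic optimizer on the device):

    # pretend small sizes favour profile 'a' and large sizes profile 'b'
    def fake_handler(sizes):
        return 'b' if sizes[0] > 1000 else 'a'

    profiles = sample_profiles(fake_handler, [(128,), (512,), (2048,), (8192,)])
    # -> ['b', 'a']: both distinct profiles are kept, and 'b' (the winner at
    #    the largest size) is moved to the front to serve as the default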

View File

@@ -62,7 +62,7 @@ class GeneticOperators(object):
     def decode(self, genome):
         FetchingPolicy = atd.FetchingPolicy
-        fetch = [FetchingPolicy.FETCH_FROM_LOCAL, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS, FetchingPolicy.FETCH_FROM_GLOBAL_STRIDED]
+        fetch = [FetchingPolicy.FETCH_FROM_LOCAL, FetchingPolicy.FETCH_FROM_GLOBAL_STRIDED, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS]
         decode_element = lambda x:2**int(b_gray_to_bin(''.join(x)), 2)
         result = []
         offset = 0
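Swapping the last two entries of the fetch list only changes which FetchingPolicy each decoded gene index maps to; the Gray decoding itself is untouched. For reference, decode_element turns a Gray-coded bit slice of the genome into a power-of-two parameter. A minimal stand-in for b_gray_to_bin (which is imported elsewhere in this file; this sketch assumes the standard Gray-to-binary definition):

    def gray_to_bin(g):
        # the first bit is copied; each following binary bit is the previous
        # binary bit XORed with the current Gray bit
        b = g[0]
        for c in g[1:]:
            b += str(int(b[-1] != c))
        return b

    decode_element = lambda x: 2**int(gray_to_bin(''.join(x)), 2)
    print(decode_element(['1', '1', '0']))   # Gray 110 -> binary 100 -> 2**4 = 16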

View File

@@ -15,16 +15,17 @@ def gmean(a, axis=0, dtype=None):
     return np.exp(log_a.mean(axis=axis))

 def train_model(X, Y, profiles, metric):
-    Y=Y[:,:]
-    profiles=profiles[:]
+    #Shuffle
+    p = np.random.permutation(X.shape[0])
+    X = X[p,:]
+    Y = Y[p,:]
+    #Normalize
     Ymax = np.max(Y)
     Y = Y/Ymax
     #Train the model
     cut = int(0.75*X.shape[0])
-    clf = ensemble.RandomForestRegressor(10, max_depth=4).fit(X[:cut,:], Y[:cut,:])
-    print clf.predict([10000])
+    clf = ensemble.RandomForestRegressor(10, max_depth=3).fit(X[:cut,:], Y[:cut,:])
     t = np.argmin(clf.predict(X[cut:,:]), axis = 1)
     s = np.array([y[0]/y[k] for y,k in zip(Y[cut:,:], t)])
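A self-contained sketch of this train/validate flow on synthetic data, showing why the shuffle matters now that rows arrive in grid order rather than at random (the RandomForestRegressor(10, max_depth=3) call is the one used above; X, Y, and the runtimes interpretation are fabricated for illustration):

    import numpy as np
    from sklearn import ensemble

    X = np.sort(np.random.rand(200, 2), axis=0)   # grid-ordered sizes
    Y = np.random.rand(200, 4)                    # runtimes of 4 profiles per size

    p = np.random.permutation(X.shape[0])         # shuffle so the 75/25 split
    X, Y = X[p, :], Y[p, :]                       # is not biased toward large sizes
    Y = Y / np.max(Y)

    cut = int(0.75 * X.shape[0])
    clf = ensemble.RandomForestRegressor(10, max_depth=3).fit(X[:cut, :], Y[:cut, :])
    t = np.argmin(clf.predict(X[cut:, :]), axis=1)   # predicted-fastest profile
    s = np.array([y[0] / y[k] for y, k in zip(Y[cut:, :], t)])
    print(s.mean())   # > 1 means the predictor beats always picking profiles[0]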