Porting GA for all the operations

This commit is contained in:
Philippe Tillet
2014-10-03 09:29:45 +02:00
parent 2f6d41f661
commit 044419f9f0
6 changed files with 76 additions and 78 deletions

View File

@@ -1,10 +1,10 @@
#will save the archive into /tmp/name-of-operation.dat #will save the archive into /tmp/name-of-operation.dat
tmp-folder = /tmp/ tmp-folder = /tmp/
#~ [vector-axpy] [vector-axpy]
#~ devices = 0 devices = 0
#~ precision = all precision = single
#~ size = 10000000 size = 10000000
#~ [matrix-axpy] #~ [matrix-axpy]
#~ devices = 0 #~ devices = 0

View File

@@ -24,39 +24,35 @@ DATATYPES = { 'single' : vcl.float32,
'double' : vcl.float64 } 'double' : vcl.float64 }
TYPES = { 'vector-axpy': {'template':vcl.atidlas.VectorAxpyTemplate, TYPES = { 'vector-axpy': {'template':vcl.atidlas.VectorAxpyTemplate,
'parameter-names':['simd-width', 'local-size-0', 'num-groups-0', 'fetch'],
'perf-index':lambda x: 3*x[0]*x[1][0]/x[2]*1e-9, 'perf-index':lambda x: 3*x[0]*x[1][0]/x[2]*1e-9,
'perf-measure':'GB/s'}, 'perf-measure':'GB/s'},
'matrix-axpy': {'template':vcl.atidlas.MatrixAxpyTemplate, 'matrix-axpy': {'template':vcl.atidlas.MatrixAxpyTemplate,
'parameter-names':['simd-width', 'local-size-0', 'local-size-1', 'num-groups-0', 'num-groups-1', 'fetch'],
'perf-index':lambda x: 3*x[0]*x[1][0]*x[1][1]/x[2]*1e-9, 'perf-index':lambda x: 3*x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
'perf-measure':'GB/s'}, 'perf-measure':'GB/s'},
'reduction': {'template':vcl.atidlas.ReductionTemplate, 'reduction': {'template':vcl.atidlas.ReductionTemplate,
'parameter-names':['simd-width', 'local-size-0', 'num-groups-0', 'fetch'],
'perf-index':lambda x: 2*x[0]*x[1][0]*x[1][1]/x[2]*1e-9, 'perf-index':lambda x: 2*x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
'perf-measure':'GB/s'}, 'perf-measure':'GB/s'},
'row-wise-reduction': {'template':vcl.atidlas.RowWiseReductionTemplate, 'row-wise-reduction': {'template':vcl.atidlas.RowWiseReductionTemplate,
'parameter-names':['simd-width', 'local-size-0', 'local-size-1', 'num-groups-0', 'fetch'],
'perf-index':lambda x: x[0]*x[1][0]*x[1][1]/x[2]*1e-9, 'perf-index':lambda x: x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
'perf-measure':'GB/s'}, 'perf-measure':'GB/s'},
'matrix-product': {'template':vcl.atidlas.MatrixProductTemplate, 'matrix-product': {'template':vcl.atidlas.MatrixProductTemplate,
'parameter-names':['simd-width', 'local-size-0', 'kL', 'local-size-1', 'mS', 'kS', 'nS', 'A-fetch-policy', 'B-fetch-policy', 'local-fetch-size-0', 'local-fetch-size-1'],
'perf-index': lambda x: 2*x[1][0]*x[1][1]*x[1][2]/x[2]*1e-9, 'perf-index': lambda x: 2*x[1][0]*x[1][1]*x[1][2]/x[2]*1e-9,
'perf-measure': 'GFLOP/s'} } 'perf-measure': 'GFLOP/s'} }
def do_tuning(config_fname, spec_fname, viennacl_root): def do_tuning(config_fname, spec_fname, viennacl_root):
config = ConfigObj(config_fname, configspec=spec_fname) config = ConfigObj(config_fname, configspec=spec_fname)
map_to_list = lambda T: list(map(T[0], T[1] if isinstance(T[1], list) else [T[1]])) def map_to_list(T, x):
return list(map(T, x if isinstance(x, list) else [x]))
for operation in ['vector-axpy', 'matrix-axpy', 'row-wise-reduction', 'matrix-product']: for operation in ['vector-axpy', 'matrix-axpy', 'row-wise-reduction', 'matrix-product']:
if operation in config: if operation in config:
p = config[operation] p = config[operation]
confdevices = p['devices'] confdevices = p['devices']
devices = utils.DEVICES_PRESETS[confdevices] if confdevices in utils.DEVICES_PRESETS else [utils.all_devices[int(i)] for i in confdevices] devices = utils.DEVICES_PRESETS[confdevices] if confdevices in utils.DEVICES_PRESETS else [utils.all_devices[int(i)] for i in confdevices]
precisions = map_to_list((str, p['precision'])) precisions = map_to_list(str, p['precision'])
datatypes = [DATATYPES[k] for k in precisions] datatypes = [DATATYPES[k] for k in precisions]
#Iterate through the datatypes and the devices #Iterate through the datatypes and the devices
for datatype, device in itertools.product(datatypes, devices): for datatype, device in itertools.product(datatypes, devices):
@@ -68,18 +64,23 @@ def do_tuning(config_fname, spec_fname, viennacl_root):
sys.stderr.write('Warning : The device ' + device.name + ' does not support double precision! Skipping ...') sys.stderr.write('Warning : The device ' + device.name + ' does not support double precision! Skipping ...')
continue continue
#Helper #Helper
def execute(statement, other_params, sizes, fname = os.devnull): def execute(device, statement, other_params, sizes, fname = os.devnull, parameters = None):
if parameters:
TemplateType = TYPES[operation]['template']
return tools.benchmark(TemplateType(TemplateType.Parameters(*parameters),*other_params), statement, device)
print('-----') print('-----')
print(' '.join(map(str, ("Now tuning:", datatype.__name__, '-', operation, '-'.join(other_params), '[' + device.name, '(' + device.platform.name + ')] for sizes', sizes)))) print(' '.join(map(str, ("Now tuning:", datatype.__name__, '-', operation, '-'.join(other_params), '[' + device.name, '(' + device.platform.name + ')] for sizes', sizes))))
with open(fname, "w+") as archive: with open(fname, "w+") as archive:
return optimize.genetic(statement, ctx, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params), return optimize.genetic(statement, device, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params),
TYPES[operation]['parameter-names'], lambda t: TYPES[operation]['perf-index']([datatype().itemsize, sizes, t]), TYPES[operation]['perf-measure'], archive) lambda t: TYPES[operation]['perf-index']([datatype().itemsize, sizes, t]), TYPES[operation]['perf-measure'], archive)
s = map_to_list((int, p['size']))
#Vector AXPY #Vector AXPY
if operation=='vector-axpy': if operation=='vector-axpy':
x = vcl.Vector(s[0], context=ctx, dtype=datatype) def execution_handler(sizes, fname=os.devnull, parameters=None):
y = vcl.Vector(s[0], context=ctx, dtype=datatype) x = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
execute(vcl.ElementProd(vcl.exp(x + y),vcl.cos(x + y)), ()) y = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
return execute(device, vcl.Statement(vcl.ElementProd(vcl.exp(x + y),vcl.cos(x + y))), (), sizes, fname, parameters)
if 'size' in p:
profile = execution_handler(map_to_list(int, p['size']))
#Matrix AXPY #Matrix AXPY
if operation=='matrix-axpy': if operation=='matrix-axpy':
A = vcl.Matrix(s, context=ctx, dtype=datatype) A = vcl.Matrix(s, context=ctx, dtype=datatype)
@@ -112,11 +113,10 @@ def do_tuning(config_fname, spec_fname, viennacl_root):
beta = vcl.HostScalar(1.0, context=ctx, dtype = datatype) beta = vcl.HostScalar(1.0, context=ctx, dtype = datatype)
C = vcl.Matrix((sizes[0], sizes[2]), context=ctx, dtype = datatype, layout=vcl.COL_MAJOR) C = vcl.Matrix((sizes[0], sizes[2]), context=ctx, dtype = datatype, layout=vcl.COL_MAJOR)
statement = vcl.Statement(vcl.Assign(C,LHS*RHS*alpha + C*beta)) statement = vcl.Statement(vcl.Assign(C,LHS*RHS*alpha + C*beta))
if parameters: return execute(device, statement,(A_trans, B_trans), sizes, fname, parameters)
TemplateType = TYPES[operation]['template'] if 'size' in p:
return tools.benchmark(TemplateType(TemplateType.Parameters(*parameters),A_trans,B_trans), statement, device) profile = execution_handler(map(int, p['size']))
else: else:
return execute(statement,(A_trans, B_trans), sizes, fname)
X, Y, profiles = generate_dataset(TYPES[operation]['template'], execution_handler) X, Y, profiles = generate_dataset(TYPES[operation]['template'], execution_handler)
train_model(X, Y, profiles) train_model(X, Y, profiles)

View File

@@ -28,7 +28,7 @@ def resample(X, tbincount, densities, step):
return x.astype(int) return x.astype(int)
def generate_dataset(TemplateType, execution_handler): def generate_dataset(TemplateType, execution_handler):
I = 10 I = 50
step = 64 step = 64
path = "./data" path = "./data"
@@ -47,13 +47,13 @@ def generate_dataset(TemplateType, execution_handler):
# densities = [KernelDensity(kernel='gaussian', bandwidth=2*step).fit(X[t==i,:]) for i in range(int(max(t))+1)]; # densities = [KernelDensity(kernel='gaussian', bandwidth=2*step).fit(X[t==i,:]) for i in range(int(max(t))+1)];
# #
# print "Generating the dataset..." # print "Generating the dataset..."
# N = 1000 # N = 10000
# Y = np.empty((N, len(profiles))) # Y = np.empty((N, len(profiles)))
# X = np.empty((N,3)) # X = np.empty((N,3))
# t = [] # t = []
# #
# for i in range(N): # for i in range(N):
# x = resample(X, np.bincount(t), densities, step) # x = resample(X, [], [], step)
# for j,y in enumerate(profiles): # for j,y in enumerate(profiles):
# T = execution_handler(x, os.devnull, decode(map(int, y))) # T = execution_handler(x, os.devnull, decode(map(int, y)))
# Y[i,j] = 2*1e-9*x[0]*x[1]*x[2]/T # Y[i,j] = 2*1e-9*x[0]*x[1]*x[2]/T
@@ -61,6 +61,9 @@ def generate_dataset(TemplateType, execution_handler):
# X[i,:] = x # X[i,:] = x
# t = np.argmax(Y[:i+1,], axis=1) # t = np.argmax(Y[:i+1,], axis=1)
# densities[idx].fit(X[t==idx,:]) # densities[idx].fit(X[t==idx,:])
# if i%10==0:
# sys.stdout.write('%d data points generated\r'%i)
# sys.stdout.flush()
# #
# np.savetxt(os.path.join(path,"profiles.csv"), profiles) # np.savetxt(os.path.join(path,"profiles.csv"), profiles)
# np.savetxt(os.path.join(path,"X.csv"), X) # np.savetxt(os.path.join(path,"X.csv"), X)

View File

@@ -33,10 +33,9 @@ def b_gray_to_bin(A='00000000', endian='big'):
class GeneticOperators(object): class GeneticOperators(object):
def __init__(self, device, statement, parameter_names, TemplateType, build_template, out): def __init__(self, device, statement, TemplateType, build_template, out):
self.device = device self.device = device
self.statement = statement self.statement = statement
self.parameter_names = parameter_names
self.TemplateType = TemplateType self.TemplateType = TemplateType
self.ParameterType = TemplateType.Parameters self.ParameterType = TemplateType.Parameters
self.build_template = build_template self.build_template = build_template
@@ -44,6 +43,11 @@ class GeneticOperators(object):
self.indpb = 0.05 self.indpb = 0.05
self.out = out self.out = out
self.genome_info = {
vcl.atidlas.VectorAxpyTemplate: [3,4,4,vcl.atidlas.FetchingPolicy],
vcl.atidlas.MatrixProductTemplate: [3,3,3,3,3,3,3,vcl.atidlas.FetchingPolicy,vcl.atidlas.FetchingPolicy,3]
}[TemplateType]
creator.create("FitnessMin", base.Fitness, weights=(-1.0,)) creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin) creator.create("Individual", list, fitness=creator.FitnessMin)
@@ -54,35 +58,39 @@ class GeneticOperators(object):
self.toolbox.register("mutate", self.mutate) self.toolbox.register("mutate", self.mutate)
self.toolbox.register("select", deap_tools.selNSGA2) self.toolbox.register("select", deap_tools.selNSGA2)
@staticmethod def decode(self, genome):
def decode(s):
FetchingPolicy = vcl.atidlas.FetchingPolicy FetchingPolicy = vcl.atidlas.FetchingPolicy
fetch = [FetchingPolicy.FETCH_FROM_LOCAL, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS, FetchingPolicy.FETCH_FROM_GLOBAL_STRIDED] fetch = [FetchingPolicy.FETCH_FROM_LOCAL, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS, FetchingPolicy.FETCH_FROM_GLOBAL_STRIDED]
fetchA = fetch[s[0]] decode_element = lambda x:2**int(b_gray_to_bin(''.join(x)), 2)
fetchB = fetch[s[1]] result = []
bincode = ''.join(s[2:]) offset = 0
decode_element = lambda x:2**int(b_gray_to_bin(x), 2) for x in self.genome_info:
simd = decode_element(bincode[0:3]) if x==vcl.atidlas.FetchingPolicy:
ls0 = decode_element(bincode[2:5]) result.append(fetch[genome[offset]])
ls1 = decode_element(bincode[5:8]) offset = offset + 1
kL = decode_element(bincode[8:11])
mS = decode_element(bincode[11:14])
kS = decode_element(bincode[14:17])
nS = decode_element(bincode[17:20])
if fetchA==FetchingPolicy.FETCH_FROM_LOCAL or fetchB==FetchingPolicy.FETCH_FROM_LOCAL:
lf0 = decode_element(bincode[20:23])
lf1 = ls0*ls1/lf0
else: else:
lf0, lf1 = 0, 0 result.append(decode_element(genome[offset:offset+x]))
return [simd, ls0, kL, ls1, mS, kS, nS, fetchA, fetchB, lf0, lf1] offset = offset + x
#GEMM peculiarities
if self.TemplateType==vcl.atidlas.MatrixProductTemplate:
if FetchingPolicy.FETCH_FROM_LOCAL in result:
lf1 = result[1]*result[3]/result[9]
else:
result[9] = 0
lf1 = 0
result.append(lf1)
return result
def init(self, N): def init(self, N):
result = [] result = []
fetchcount = [0, 0, 0]
while len(result) < N: while len(result) < N:
while True: while True:
fetch = random.randint(0,2) bincode = []
bincode = [fetch, fetch] + [str(random.randint(0,1)) for i in range(23)] for x in self.genome_info:
if x==vcl.atidlas.FetchingPolicy:
bincode = bincode + [random.randint(0,2)]
else:
bincode = bincode + [str(random.randint(0,1)) for i in range(x)]
parameters = self.decode(bincode) parameters = self.decode(bincode)
template = self.build_template(self.TemplateType.Parameters(*parameters)) template = self.build_template(self.TemplateType.Parameters(*parameters))
registers_usage = template.registers_usage(vcl.atidlas.StatementsTuple(self.statement))/4 registers_usage = template.registers_usage(vcl.atidlas.StatementsTuple(self.statement))/4
@@ -90,22 +98,18 @@ class GeneticOperators(object):
local_size = template.parameters.local_size_0*template.parameters.local_size_1 local_size = template.parameters.local_size_0*template.parameters.local_size_1
occupancy_record = tools.OccupancyRecord(self.device, local_size, lmem_usage, registers_usage) occupancy_record = tools.OccupancyRecord(self.device, local_size, lmem_usage, registers_usage)
if not tools.skip(template, self.statement, self.device): if not tools.skip(template, self.statement, self.device):
fetchcount[fetch] = fetchcount[fetch] + 1
if max(fetchcount) - min(fetchcount) <= 1:
result.append(creator.Individual(bincode)) result.append(creator.Individual(bincode))
break break
else:
fetchcount[fetch] = fetchcount[fetch] - 1
return result return result
def mutate(self, individual): def mutate(self, individual):
while True: while True:
new_individual = copy.deepcopy(individual) new_individual = copy.deepcopy(individual)
for i in range(len(new_individual)): for i in range(len(new_individual)):
if i < 2 and random.random() < self.indpb: if isinstance(individual[i], int) and random.random() < self.indpb:
while new_individual[i] == individual[i]: while new_individual[i] == individual[i]:
new_individual[i] = random.randint(0, 2) new_individual[i] = random.randint(0, 2)
elif i >= 2 and random.random() < self.indpb: elif not isinstance(individual[i], int) and random.random() < self.indpb:
new_individual[i] = '1' if new_individual[i]=='0' else '0' new_individual[i] = '1' if new_individual[i]=='0' else '0'
parameters = self.decode(new_individual) parameters = self.decode(new_individual)
template = self.build_template(self.TemplateType.Parameters(*parameters)) template = self.build_template(self.TemplateType.Parameters(*parameters))
@@ -176,7 +180,7 @@ class GeneticOperators(object):
population[:] = self.toolbox.select(population + offspring, mu) population[:] = self.toolbox.select(population + offspring, mu)
#Update #Update
gen = gen + 1 gen = gen + 1
best_profile = '(%s)'%','.join(map(str,GeneticOperators.decode(hof[0]))); best_profile = '(%s)'%','.join(map(str,self.decode(hof[0])));
best_performance = compute_perf(hof[0].fitness.values[0]) best_performance = compute_perf(hof[0].fitness.values[0])
sys.stdout.write('Time %d | Best %d %s [ for %s ]\r'%(time.time() - start_time, best_performance, perf_metric, best_profile)) sys.stdout.write('Time %d | Best %d %s [ for %s ]\r'%(time.time() - start_time, best_performance, perf_metric, best_profile))
sys.stdout.flush() sys.stdout.flush()

View File

@@ -13,12 +13,13 @@ def train_model(X, Y, profiles):
Xmean = np.mean(X, axis=0) Xmean = np.mean(X, axis=0)
Xstd = np.std(X, axis=0) Xstd = np.std(X, axis=0)
X = (X - Xmean)/Xstd X = (X - Xmean)/Xstd
Ymax = np.max(Y) Ymax = np.max(Y)
Y = Y/Ymax Y = Y/Ymax
ref = np.argmax(np.bincount(np.argmax(Y, axis=1))) #most common profile ref = np.argmax(np.bincount(np.argmax(Y, axis=1))) #most common profile
#Cross-validation data-sets #Cross-validation data-sets
cut = int(0.1*X.shape[0]+1) cut = int(0.800*X.shape[0]+1)
XTr = X[0:cut, :] XTr = X[0:cut, :]
YTr = Y[0:cut, :] YTr = Y[0:cut, :]
XTe = X[cut:,:] XTe = X[cut:,:]
@@ -26,23 +27,15 @@ def train_model(X, Y, profiles):
#Train the model #Train the model
print("Training the model...") print("Training the model...")
ds = SupervisedDataSet(X.shape[1], Y.shape[1]) clf = ensemble.RandomForestRegressor(40).fit(XTr,YTr)
for idx, x in enumerate(X):
ds.addSample(x, Y[idx,:])
clf = buildNetwork(*[X.shape[1], 100, Y.shape[1]], hiddenclass = TanhLayer, outclass = LinearLayer)
#print fnn;
#trainer = RPropMinusTrainer( fnn, dataset=ds, verbose=True);
trainer = BackpropTrainer( clf, dataset=ds, verbose=True, momentum=0.01, weightdecay=0.01, learningrate=0.002, batchlearning=False)
trainer.trainUntilConvergence(maxEpochs=100)
#Evaluate the model #Evaluate the model
GFlops = np.empty(XTe.shape[0]) GFlops = np.empty(XTe.shape[0])
speedups = np.empty(XTe.shape[0]) speedups = np.empty(XTe.shape[0])
optspeedups = np.empty(XTe.shape[0]) optspeedups = np.empty(XTe.shape[0])
for i,x in enumerate(XTe): for i,x in enumerate(XTe):
predictions = clf.activate(x) predictions = clf.predict(x)
label = np.argmax(predictions) label = np.argmax(predictions)
# print YTe[i,label], YTe[i,ref], np.max(YTe[i,:])
speedups[i] = YTe[i,label]/YTe[i,ref] speedups[i] = YTe[i,label]/YTe[i,ref]
optspeedups[i] = np.max(YTe[i,:])/YTe[i,ref] optspeedups[i] = np.max(YTe[i,:])/YTe[i,ref]
GFlops[i] = YTe[i,ref]*Ymax GFlops[i] = YTe[i,ref]*Ymax
@@ -52,7 +45,5 @@ def train_model(X, Y, profiles):
print("Average testing speedup : %f (Optimal : %f)"%(sp.stats.gmean(speedups), sp.stats.gmean(optspeedups))) print("Average testing speedup : %f (Optimal : %f)"%(sp.stats.gmean(speedups), sp.stats.gmean(optspeedups)))
print("Average GFLOP/s : %f (Default %f, Optimal %f)"%(np.mean(np.multiply(GFlops,speedups)), np.mean(GFlops), np.mean(np.multiply(GFlops,optspeedups)))) print("Average GFLOP/s : %f (Default %f, Optimal %f)"%(np.mean(np.multiply(GFlops,speedups)), np.mean(GFlops), np.mean(np.multiply(GFlops,optspeedups))))
print("Minimum speedup is %f wrt %i GFlops"%(np.min(speedups), GFlops[np.argmin(speedups)])) print("Minimum speedup is %f wrt %i GFlops"%(np.min(speedups), GFlops[np.argmin(speedups)]))
print("Maximum speedup is %f wrt %i GFlops"%(np.max(speedups), GFlops[np.argmax(speedups)])) print("Maximum speedup is %f wrt %i GFlops for %s"%(np.max(speedups), GFlops[np.argmax(speedups)], X[np.argmax(speedups)]*Xstd+Xmean))
print("--------") print("--------")
print clf

View File

@@ -48,6 +48,6 @@ from genetic import GeneticOperators
#~ sys.stdout.flush() #~ sys.stdout.flush()
#~ #~
def genetic(statement, context, TemplateType, build_template, parameter_names, compute_perf, perf_metric, out): def genetic(statement, device, TemplateType, build_template, compute_perf, perf_metric, out):
GA = GeneticOperators(context.devices[0], statement, parameter_names, TemplateType, build_template, out) GA = GeneticOperators(device, statement, TemplateType, build_template, out)
return GA.optimize(maxtime='2m30s', maxgen=1000, compute_perf=compute_perf, perf_metric=perf_metric) return GA.optimize(maxtime='2m30s', maxgen=1000, compute_perf=compute_perf, perf_metric=perf_metric)