Now compiling ATIDLAS
This commit is contained in:
172
python/autotune/autotune.py
Normal file
172
python/autotune/autotune.py
Normal file
@@ -0,0 +1,172 @@
|
||||
from __future__ import division
|
||||
|
||||
import argparse
|
||||
import itertools
|
||||
import os
|
||||
|
||||
from configobj import ConfigObj
|
||||
from numpy import random
|
||||
|
||||
import pyopencl as cl
|
||||
import pyviennacl as vcl
|
||||
from pyviennacl import backend, opencl, atidlas
|
||||
from dataset import generate_dataset
|
||||
from model import train_model
|
||||
import tools
|
||||
import optimize
|
||||
import sys
|
||||
|
||||
# Maps the precision names accepted in the configuration file to pyviennacl dtypes.
DATATYPES = { 'single' : vcl.float32,
              'double' : vcl.float64 }

# Per-operation tuning metadata:
#   'template'     : the atidlas template class used to generate kernels
#   'perf-index'   : callable turning a measurement into a performance figure
#   'perf-measure' : unit label printed next to that figure
# Each 'perf-index' receives x = [itemsize, sizes, t]; do_tuning builds it as
# [datatype().itemsize, sizes, t].
# NOTE(review): t is presumably the measured time in seconds (the 1e-9 factor
# would then yield GB/s or GFLOP/s) -- confirm against tools.benchmark.
TYPES = { 'vector-axpy': {'template':vcl.atidlas.VectorAxpyTemplate,
                          'perf-index':lambda x: 3*x[0]*x[1][0]/x[2]*1e-9,
                          'perf-measure':'GB/s'},

          'matrix-axpy': {'template':vcl.atidlas.MatrixAxpyTemplate,
                          'perf-index':lambda x: 3*x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
                          'perf-measure':'GB/s'},

          'reduction': {'template':vcl.atidlas.ReductionTemplate,
                        'perf-index':lambda x: 2*x[0]*x[1][0]/x[2]*1e-9,
                        'perf-measure':'GB/s'},

          'row-wise-reduction': {'template':vcl.atidlas.RowWiseReductionTemplate,
                                 'perf-index':lambda x: x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
                                 'perf-measure':'GB/s'},

          # The matrix product is the only entry that ignores the item size (x[0]).
          'matrix-product': {'template':vcl.atidlas.MatrixProductTemplate,
                             'perf-index': lambda x: 2*x[1][0]*x[1][1]*x[1][2]/x[2]*1e-9,
                             'perf-measure': 'GFLOP/s'} }
|
||||
|
||||
def do_tuning(config_fname, viennacl_root):
    """Tune every operation that has a section in the configuration file.

    For each configured operation, and for each (datatype, device) pair it
    selects, this either tunes one fixed problem size (when the section has
    a 'size' entry) or generates a dataset of random sizes and trains a
    model on it.

    Parameters:
        config_fname: path of the ConfigObj-style configuration file.
        viennacl_root: accepted for interface compatibility.
            NOTE(review): this argument is never read; the header update is
            driven by the 'viennacl-src-root' key of the configuration file
            instead -- confirm whether the CLI flag should feed it.
    """
    config = ConfigObj(config_fname)

    def map_to_list(T, x):
        # Configuration values are either a single value or a list of values;
        # normalise both shapes to a list converted with T.
        return list(map(T, x if isinstance(x, list) else [x]))

    for operation in ['vector-axpy', 'matrix-axpy', 'reduction', 'row-wise-reduction', 'matrix-product']:
        if operation not in config:
            continue
        p = config[operation]
        confdevices = p['devices']
        all_devices = [d for platform in cl.get_platforms() for d in platform.get_devices()]
        DEVICES_PRESETS = {'all': all_devices,
                           'gpus': [d for d in all_devices if d.type==cl.device_type.GPU],
                           'cpus': [d for d in all_devices if d.type==cl.device_type.CPU],
                           'accelerators': [d for d in all_devices if d.type==cl.device_type.ACCELERATOR]
                           }
        # A list of indices is unhashable, so only a scalar entry may be looked
        # up in the presets. Indices go through map_to_list so that a scalar
        # such as '12' means device 12 rather than devices 1 and 2.
        if not isinstance(confdevices, list) and confdevices in DEVICES_PRESETS:
            devices = DEVICES_PRESETS[confdevices]
        else:
            devices = [all_devices[i] for i in map_to_list(int, confdevices)]
        precisions = map_to_list(str, p['precision'])
        if 'all' in precisions:
            precisions = ['single','double']
        datatypes = [DATATYPES[k] for k in precisions]

        #Iterate through the datatypes and the devices
        for datatype, device in itertools.product(datatypes, devices):
            ctx = cl.Context([device])
            ctx = vcl.backend.Context(ctx)
            device = ctx.current_device

            #Check data-type
            if datatype is vcl.float64 and not device.double_fp_config:
                sys.stderr.write('Warning : The device ' + device.name + ' does not support double precision! Skipping ...\n')
                continue

            #Helper for execution
            def execute(device, node, other_params, sizes, fname = os.devnull, parameters = None):
                with vcl.Statement(node) as statement:
                    # With explicit parameters: benchmark that single profile.
                    if parameters:
                        TemplateType = TYPES[operation]['template']
                        return tools.benchmark(TemplateType(TemplateType.Parameters(*parameters),*other_params), statement, device)
                    # Without parameters: search for a profile with the genetic optimizer.
                    print('-----')
                    print(' '.join(map(str, ("Now tuning:", datatype.__name__, '-', operation, '-'.join(other_params), '[' + device.name, '(' + device.platform.name + ')] for sizes', sizes))))
                    with open(fname, "w+") as archive:
                        return optimize.genetic(statement, device, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params),
                                                lambda t: TYPES[operation]['perf-index']([datatype().itemsize, sizes, t]), TYPES[operation]['perf-measure'], archive)

            #Helper for tuning
            def tune(execution_handler, nTuning, nDataPoints, draw, additional_parameters):
                if 'size' in p:
                    # Fixed problem size: tune it once and optionally export the profile.
                    profile = execution_handler(map_to_list(int, p['size']))
                    if 'viennacl-src-root' in config:
                        tools.update_viennacl_headers(config['viennacl-src-root'],device,datatype,operation,additional_parameters,profile)
                else:
                    # No fixed size: sample sizes with draw() and fit a model.
                    X, Y, profiles = generate_dataset(TYPES[operation]['template'], execution_handler, nTuning, nDataPoints, draw)
                    train_model(X, Y, profiles, TYPES[operation]['perf-measure'])

            # In the draw callbacks below, `random` is numpy.random (imported
            # at the top of this file); the name `np` is not defined here.

            #Vector AXPY
            if operation=='vector-axpy':
                def execution_handler(sizes, fname=os.devnull, parameters=None):
                    x = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
                    y = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
                    z = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
                    return execute(device, vcl.Assign(z, vcl.ElementProd(vcl.exp(x + y),vcl.cos(x + y))), (), sizes, fname, parameters)
                tune(execution_handler, 30, 1000, lambda : 64*random.randint(low=10, high=100000, size=1), ())

            #Reduction
            if operation=='reduction':
                def execution_handler(sizes, fname=os.devnull, parameters=None):
                    x = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
                    y = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
                    s = vcl.Scalar(0, context=ctx, dtype=datatype)
                    return execute(device, vcl.Assign(s, vcl.Dot(x,y)), (), sizes, fname, parameters)
                tune(execution_handler, 50, 1000, lambda : 64*random.randint(low=10, high=100000, size=1), ())

            #Matrix AXPY
            if operation=='matrix-axpy':
                def execution_handler(sizes, fname=os.devnull, parameters=None):
                    A = vcl.Matrix(sizes, context=ctx, dtype=datatype)
                    B = vcl.Matrix(sizes, context=ctx, dtype=datatype)
                    C = vcl.Matrix(sizes, context=ctx, dtype=datatype)
                    return execute(device, vcl.Assign(C,A+B), (), sizes, fname, parameters)
                tune(execution_handler, 50, 1000, lambda : 64*random.randint(low=5, high=100, size=2), ())

            #Row-wise reduction
            if operation=='row-wise-reduction':
                layouts = map_to_list(str,p['layout'])
                if 'all' in layouts:
                    layouts = ['N', 'T']
                for A_trans in layouts:
                    def execution_handler(sizes, fname=os.devnull, parameters=None):
                        A = vcl.Matrix(sizes if A_trans=='N' else sizes[::-1], context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
                        x = vcl.Vector(sizes[1] if A_trans=='N' else sizes[0], context=ctx, dtype=datatype)
                        y = vcl.Vector(sizes[0] if A_trans=='N' else sizes[1], context=ctx, dtype=datatype)
                        LHS = A if A_trans=='N' else A.T
                        return execute(device, vcl.Assign(y, LHS*x), (), sizes, fname, parameters)
                    tune(execution_handler, 50, 1000, lambda : 64*random.randint(low=5, high=100, size=2), (A_trans,))

            #Matrix Product
            if operation=='matrix-product':
                layouts = map_to_list(str,p['layout'])
                if 'all' in layouts:
                    layouts = ['NN', 'NT', 'TN', 'TT']
                for layout in layouts:
                    def execution_handler(sizes, fname=os.devnull, parameters=None):
                        A_trans = layout[0]
                        B_trans = layout[1]
                        A = vcl.Matrix((sizes[0], sizes[1]) if A_trans=='N' else (sizes[1],sizes[0]), context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
                        B = vcl.Matrix((sizes[1], sizes[2]) if B_trans=='N' else (sizes[2],sizes[1]), context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
                        LHS = A if A_trans=='N' else A.T
                        RHS = B if B_trans=='N' else B.T
                        alpha = vcl.HostScalar(1.0, context=ctx, dtype = datatype)
                        beta = vcl.HostScalar(1.0, context=ctx, dtype = datatype)
                        C = vcl.Matrix((sizes[0], sizes[2]), context=ctx, dtype = datatype, layout=vcl.COL_MAJOR)
                        return execute(device, vcl.Assign(C,LHS*RHS*alpha + C*beta),(A_trans, B_trans), sizes, fname, parameters)
                    tune(execution_handler, 50, 2000, lambda : 64*random.randint(low=1, high=40, size=3),(layout[0], layout[1]))
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command line: `list-devices` enumerates the OpenCL devices, `tune` runs
    # the auto-tuner on a configuration file.
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest='action')
    print_devices_parser = subparsers.add_parser('list-devices', help='list the devices available')
    tune_parser = subparsers.add_parser('tune', help='tune using a specific configuration file')
    tune_parser.add_argument("--config", default="config.ini", required=False, type=str)
    tune_parser.add_argument("--viennacl-root", default='', required=False, type=str)
    args = parser.parse_args()

    if args.action == 'list-devices':
        print("----------------")
        print("Devices available:")
        print("----------------")
        devices = [d for platform in cl.get_platforms() for d in platform.get_devices()]
        for (i, d) in enumerate(devices):
            # Join into one string: without the print_function import, a
            # multi-argument print(...) would display a tuple on Python 2.
            print(' '.join(map(str, ('Device', i, ':', tools.DEVICE_TYPE_PREFIX[d.type].upper() + ':', d.name, 'on', d.platform.name))))
        print("----------------")
    elif args.action == 'tune':
        print("------")
        print("Auto-tuning")
        print("------")
        do_tuning(args.config, args.viennacl_root)
    else:
        # No sub-command given: args carries no tuning options, so show usage.
        parser.print_help()
|
63
python/autotune/dataset.py
Normal file
63
python/autotune/dataset.py
Normal file
@@ -0,0 +1,63 @@
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import random
|
||||
import numpy as np
|
||||
from pyviennacl.atidlas import FetchingPolicy
|
||||
|
||||
def resample(X, draw):
    """Draw a sample that does not already appear as a row of X.

    Parameters:
        X: iterable of rows (e.g. a 2-D array) holding the samples drawn so far.
        draw: zero-argument callable returning a new candidate; the result
            must support tuple() and .astype (e.g. a numpy array).

    Returns:
        The first candidate whose values match no row of X, cast with
        .astype(int). Loops until draw() produces such a candidate.
    """
    # Hash the existing rows once so every membership test is O(1).
    seen = set(tuple(row) for row in X)
    while True:
        x = draw()
        if tuple(x) not in seen:
            return x.astype(int)
|
||||
|
||||
def generate_dataset(TemplateType, execution_handler, nTuning, nDataPoints, draw):
    """Build a (sizes, measurements) dataset for a kernel template.

    Phase 1 calls execution_handler(x) on nTuning distinct random sizes and
    collects the distinct profiles it returns. Phase 2 draws nDataPoints
    distinct sizes and records execution_handler(x, os.devnull, profile) for
    every collected profile.

    The arrays are written to data/<TemplateType.__name__>/ as profiles.csv,
    X.csv and Y.csv, then read back, so the returned values are what
    np.loadtxt yields for those files.

    Parameters:
        TemplateType: template class; only its __name__ is used here.
        execution_handler: callable(sizes[, fname, parameters]).
        nTuning: number of sizes used to collect profiles.
        nDataPoints: number of rows of the final dataset.
        draw: zero-argument callable returning a numpy array of sizes.

    Returns:
        (X, Y, profiles) with X of shape (nDataPoints, nDim) and Y of shape
        (nDataPoints, number of profiles).
    """
    print("Getting some good profiles...")
    nDim = draw().size
    X = np.empty((nTuning, nDim))
    profiles = []
    for i in range(nTuning):
        # Only rows [0, i) are filled; the remainder of np.empty is garbage
        # and must not take part in the duplicate check.
        x = resample(X[:i], draw)
        y = execution_handler(x)
        if y not in profiles:
            profiles.append(y)
        X[i,:] = x

    print("Generating the dataset...")
    Y = np.empty((nDataPoints, len(profiles)))
    X = np.empty((nDataPoints, nDim))

    for i in range(nDataPoints):
        x = resample(X[:i], draw)
        for j,y in enumerate(profiles):
            Y[i,j] = execution_handler(x, os.devnull, y)
        X[i,:] = x
        if i%10==0:
            sys.stdout.write('%d data points generated\r'%i)
            sys.stdout.flush()

    template_name = TemplateType.__name__
    out_dir = os.path.join("data", template_name)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    np.savetxt(os.path.join(out_dir,"profiles.csv"), profiles)
    np.savetxt(os.path.join(out_dir,"X.csv"), X)
    np.savetxt(os.path.join(out_dir,"Y.csv"), Y)

    # Reload from disk so callers receive exactly what was persisted.
    profiles = np.loadtxt(os.path.join(out_dir, "profiles.csv"))
    X = np.loadtxt(os.path.join(out_dir, "X.csv"),ndmin=2)
    Y = np.loadtxt(os.path.join(out_dir, "Y.csv"),ndmin=2)

    return X, Y, profiles
|
190
python/autotune/genetic.py
Normal file
190
python/autotune/genetic.py
Normal file
@@ -0,0 +1,190 @@
|
||||
import random
|
||||
import time
|
||||
import sys
|
||||
import tools
|
||||
import pyviennacl as vcl
|
||||
import numpy as np
|
||||
import copy
|
||||
|
||||
from deap import algorithms
|
||||
from deap import base
|
||||
from deap import creator
|
||||
from deap import tools as deap_tools
|
||||
|
||||
from collections import OrderedDict as odict
|
||||
|
||||
|
||||
def closest_divisor(N, x):
    """Return a divisor of N close to x.

    x is rounded and clamped to [1, N]; the nearest divisor at or below and
    the nearest divisor at or above that starting point are then compared by
    their distance to x. On a tie the upper candidate is returned.

    Parameters:
        N: integer whose divisors are searched.
        x: target value (int or float).

    Returns:
        An int divisor of N (1 when the clamped start is 1).
    """
    # int(): Python 2's round() returns a float, which would otherwise leak
    # into the result.
    x_low = x_high = max(1, min(int(round(x)), N))
    while N % x_low > 0 and x_low > 0:
        x_low = x_low - 1
    while N % x_high > 0 and x_high < N:
        x_high = x_high + 1
    return x_low if x - x_low < x_high - x else x_high
|
||||
|
||||
def b_gray_to_bin(A='00000000', endian='big'):
    """Convert a Gray-coded bit string to a plain binary bit string.

    Parameters:
        A: string of '0'/'1' characters holding the Gray code.
        endian: 'big' (first character is the most significant bit) or
            'little' (last character is the most significant bit).

    Returns:
        A bit string of the same length and endianness; '' for empty input.

    Raises:
        AssertionError: if endian is not the string 'big' or 'little'.
    """
    assert isinstance(endian, str)
    assert endian == 'little' or endian == 'big'
    if not A:
        # Nothing to decode; avoids indexing an empty string below.
        return ''
    if endian == 'little': A = A[::-1] # Make sure endianness is big before conversion
    bits = [A[0]]
    for i in range(1, len(A)):
        # Each binary bit is the XOR of the previous binary bit and the Gray bit.
        bits.append(str(int(bits[i-1] != A[i])))
    b = ''.join(bits)
    if endian == 'little': b = b[::-1] # Convert back to little endian if necessary
    return b
|
||||
|
||||
class GeneticOperators(object):
    """Genetic search over the parameters of an atidlas kernel template.

    A genome is a flat list. Each numeric parameter is a run of '0'/'1'
    characters (a Gray code whose decoded value k stands for 2**k) and each
    fetching policy is a single int in {0, 1, 2}.
    """

    def __init__(self, device, statement, TemplateType, build_template, out):
        """Store the tuning context and register the DEAP operators.

        Parameters:
            device: device the candidate templates are benchmarked on.
            statement: statement handed to tools.skip / tools.benchmark.
            TemplateType: one of the atidlas template classes listed in
                genome_info below (any other class raises KeyError).
            build_template: callable turning TemplateType.Parameters into a
                template instance.
            out: writable file-like object receiving one CSV line per
                successfully benchmarked individual.
        """
        self.device = device
        self.statement = statement
        self.TemplateType = TemplateType
        self.ParameterType = TemplateType.Parameters
        self.build_template = build_template
        self.cache = {}
        self.out = out

        # Genome layout: an int is the number of bits of a numeric
        # parameter, FetchingPolicy marks a single fetching-policy gene.
        self.genome_info = {
            vcl.atidlas.VectorAxpyTemplate: [3,4,4,vcl.atidlas.FetchingPolicy],
            vcl.atidlas.ReductionTemplate: [3,4,4,vcl.atidlas.FetchingPolicy],
            vcl.atidlas.MatrixAxpyTemplate: [3,3,3,3,3,vcl.atidlas.FetchingPolicy],
            vcl.atidlas.RowWiseReductionTemplate: [3,3,3,4,vcl.atidlas.FetchingPolicy],
            vcl.atidlas.MatrixProductTemplate: [3,3,3,3,3,3,3,vcl.atidlas.FetchingPolicy,vcl.atidlas.FetchingPolicy,3]
        }[TemplateType]
        # Per-gene mutation probability: one over the genome length.
        self.indpb = 1.0/sum([1 if x==vcl.atidlas.FetchingPolicy else x for x in self.genome_info])

        creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
        creator.create("Individual", list, fitness=creator.FitnessMin)

        self.toolbox = base.Toolbox()
        self.toolbox.register("population", self.init)
        self.toolbox.register("evaluate", self.evaluate)
        self.toolbox.register("mate", deap_tools.cxTwoPoint)
        self.toolbox.register("mutate", self.mutate)
        self.toolbox.register("select", deap_tools.selNSGA2)

    def decode(self, genome):
        """Translate a genome into the template parameter list.

        Numeric genes decode to 2**k, with k the Gray-decoded value of their
        bits; policy genes index into the list of fetching policies. For the
        matrix-product template one extra value is appended (see below).
        """
        FetchingPolicy = vcl.atidlas.FetchingPolicy
        fetch = [FetchingPolicy.FETCH_FROM_LOCAL, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS, FetchingPolicy.FETCH_FROM_GLOBAL_STRIDED]
        decode_element = lambda x:2**int(b_gray_to_bin(''.join(x)), 2)
        result = []
        offset = 0
        for x in self.genome_info:
            if x==vcl.atidlas.FetchingPolicy:
                result.append(fetch[genome[offset]])
                offset = offset + 1
            else:
                result.append(decode_element(genome[offset:offset+x]))
                offset = offset + x
        #GEMM peculiarities
        if self.TemplateType==vcl.atidlas.MatrixProductTemplate:
            if FetchingPolicy.FETCH_FROM_LOCAL in result:
                # Floor division keeps the integer result the original
                # Python 2 `/` produced; result[9] is a power of two >= 1.
                lf1 = result[1]*result[3]//result[9]
            else:
                # No local fetching: both trailing values are zeroed.
                result[9] = 0
                lf1 = 0
            result.append(lf1)
        return result

    def init(self, N):
        """Return N random individuals that tools.skip does not reject."""
        result = []
        while len(result) < N:
            # Redraw until a genome yields an acceptable template.
            while True:
                bincode = []
                for x in self.genome_info:
                    if x==vcl.atidlas.FetchingPolicy:
                        bincode = bincode + [random.randint(0,2)]
                    else:
                        bincode = bincode + [str(random.randint(0,1)) for i in range(x)]
                parameters = self.decode(bincode)
                template = self.build_template(self.TemplateType.Parameters(*parameters))
                if not tools.skip(template, self.statement, self.device):
                    result.append(creator.Individual(bincode))
                    break
        return result

    def mutate(self, individual):
        """Return a 1-tuple holding a mutated, non-skipped copy of individual.

        Every gene changes with probability indpb: a policy gene is redrawn
        until it differs, a bit gene is flipped. The whole mutation is
        retried until tools.skip accepts the resulting template.
        """
        while True:
            new_individual = copy.deepcopy(individual)
            for i in range(len(new_individual)):
                if isinstance(individual[i], int) and random.random() < self.indpb:
                    while new_individual[i] == individual[i]:
                        new_individual[i] = random.randint(0, 2)
                elif not isinstance(individual[i], int) and random.random() < self.indpb:
                    new_individual[i] = '1' if new_individual[i]=='0' else '0'
            parameters = self.decode(new_individual)
            template = self.build_template(self.TemplateType.Parameters(*parameters))
            if not tools.skip(template, self.statement, self.device):
                break
        return new_individual,

    def evaluate(self, individual):
        """Return the (cached) fitness of individual as a 1-tuple.

        The fitness is the value returned by tools.benchmark. A benchmark
        that raises is given the penalty value 10 and is not logged.
        """
        key = tuple(individual)
        if key not in self.cache:
            parameters = self.decode(individual)
            template = self.build_template(self.TemplateType.Parameters(*parameters))
            try:
                tt = tools.benchmark(template, self.statement, self.device)
            except Exception:
                # Failed candidates get a penalty. Catching Exception rather
                # than everything lets Ctrl-C interrupt a tuning run.
                self.cache[key] = 10
            else:
                # Logged outside the try so a write error is not mistaken
                # for a slow kernel.
                self.out.write(','.join([str(tt)] + [str(int(v)) for v in parameters])+'\n')
                self.cache[key] = tt
        return self.cache[key],

    def optimize(self, maxtime, maxgen, compute_perf, perf_metric):
        """Run the evolutionary loop and return the best decoded profile.

        Parameters:
            maxtime: time budget formatted as '<minutes>m<seconds>s'.
            maxgen: maximum number of generations.
            compute_perf: callable mapping a fitness value to the figure
                shown in the progress line.
            perf_metric: unit label shown next to that figure.

        Returns:
            decode() of the best individual seen.
        """
        hof = deap_tools.HallOfFame(1)
        # Begin the generational process
        gen = 0
        maxtime = time.strptime(maxtime, '%Mm%Ss')
        maxtime = maxtime.tm_min*60 + maxtime.tm_sec
        start_time = time.time()

        mu = 30
        cxpb = 0.2
        mutpb = 0.7

        population = self.init(mu)
        invalid_ind = [ind for ind in population if not ind.fitness.valid]
        fitnesses = self.toolbox.map(self.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
        hof.update(population)

        while time.time() - start_time < maxtime and gen < maxgen:
            # Vary the population
            offspring = []
            for _ in range(mu):
                op_choice = random.random()
                if op_choice < cxpb:            # Apply crossover
                    ind1, ind2 = map(self.toolbox.clone, random.sample(population, 2))
                    ind1, ind2 = self.toolbox.mate(ind1, ind2)
                    del ind1.fitness.values
                    offspring.append(ind1)
                elif op_choice < cxpb + mutpb:  # Apply mutation
                    ind = self.toolbox.clone(random.choice(population))
                    ind, = self.toolbox.mutate(ind)
                    del ind.fitness.values
                    offspring.append(ind)
                else:                           # Apply reproduction
                    offspring.append(random.choice(population))
            # Evaluate the individuals with an invalid fitness
            invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
            fitnesses = self.toolbox.map(self.evaluate, invalid_ind)
            for ind, fit in zip(invalid_ind, fitnesses):
                ind.fitness.values = fit
            # Update the hall of fame with the generated individuals
            hof.update(offspring)
            # Select the next generation population
            population[:] = self.toolbox.select(population + offspring, mu)
            #Update
            gen = gen + 1
            best_profile = '(%s)'%','.join(map(str,self.decode(hof[0])))
            best_performance = compute_perf(hof[0].fitness.values[0])
            sys.stdout.write('Generation %d | Time %d | Best %d %s [ for %s ]\r'%(gen, time.time() - start_time, best_performance, perf_metric, best_profile))
            sys.stdout.flush()
        sys.stdout.write('\n')
        return self.decode(hof[0])
|
83
python/autotune/model.py
Normal file
83
python/autotune/model.py
Normal file
@@ -0,0 +1,83 @@
|
||||
from sklearn import tree
|
||||
from sklearn import ensemble
|
||||
|
||||
from numpy import array, bincount, mean, std, max, argmax, min, argmin, median
|
||||
from scipy.stats import gmean
|
||||
|
||||
|
||||
# def random_forest(Xtr, Ytr):
|
||||
# clf = ensemble.RandomForestRegressor(10, max_depth=7).fit(Xtr,Ytr)
|
||||
#
|
||||
# def predict_tree(tree, x):
|
||||
# tree_ = tree.tree_
|
||||
# children_left = tree_.children_left
|
||||
# children_right = tree_.children_right
|
||||
# threshold = tree_.threshold
|
||||
# feature = tree_.feature
|
||||
# value = tree_.value
|
||||
# idx = 0
|
||||
# while children_left[idx]!=-1:
|
||||
# if x[0, feature[idx]] <= threshold[idx]:
|
||||
# idx = children_left[idx]
|
||||
# else:
|
||||
# idx = children_right[idx]
|
||||
# return value[[idx],:,:][:,:,0]
|
||||
#
|
||||
# s = 0
|
||||
# for e in clf.estimators_:
|
||||
# tree_ = e.tree_
|
||||
# children_left = tree_.children_left
|
||||
# children_right = tree_.children_right
|
||||
# threshold = tree_.threshold
|
||||
# feature = tree_.feature
|
||||
# value = tree_.value
|
||||
# s = s + value.size + feature.size + threshold.size + children_right.size + children_left.size
|
||||
# print s*4*1e-3
|
||||
# return clf, clf.predict
|
||||
#
|
||||
# def train_nn(layer_sizes, XTr, YTr, XTe, YTe):
|
||||
# optimizer = HF(open(os.devnull, 'w'), 15)
|
||||
# optimizer.doCGBacktracking = True
|
||||
# net = FeedforwardNeuralNet(layer_sizes, [Act.Tanh() for i in range(len(layer_sizes)-2)], Act.Linear(), 1e-5)
|
||||
#
|
||||
# nbatch=10
|
||||
# bsize = XTr.shape[0]/nbatch
|
||||
# data = ((XTr[(i%nbatch)*bsize:(i%nbatch+1)*bsize,:], YTr[(i%nbatch)*bsize:(i%nbatch+1)*bsize,:]) for i in range(nbatch))
|
||||
# data = HFDataSource(data, bsize, gradBatchSize = nbatch*bsize, curvatureBatchSize = bsize, lineSearchBatchSize =nbatch*bsize, gradBatchIsTrainingSet=True)
|
||||
# iters = optimizer.optimize(HFModel(net), data, 300, otherPrecondDampingTerm=net.L2Cost)
|
||||
# bestte = collections.deque([float("inf")]*5, maxlen=5)
|
||||
# for i,w in enumerate(iters):
|
||||
# Diffte = YTe - net.predictions(XTe).as_numpy_array()
|
||||
# Difftr = YTr - net.predictions(XTr).as_numpy_array()
|
||||
# Ete = np.sum(Diffte**2)
|
||||
# Etr = np.sum(Difftr**2)
|
||||
# bestte.append(min(min(bestte),Ete))
|
||||
# if min(bestte)==max(bestte):
|
||||
# print 'Final test error: ', Ete
|
||||
# return net, net.predictions
|
||||
# print 'Iteration %d | Test error = %.2f | Train error = %.2f'%(i, Ete, Etr)
|
||||
# return net, net.predictions
|
||||
|
||||
def train_model(X, Y, profiles, metric):
    """Fit a random forest predicting Y from X and print speedup statistics.

    X and Y are normalised (X by its global mean/std, Y by its global max),
    the first 80% (plus one) of the rows train the regressor and the rest
    is used to report speedups over a reference profile.

    `profiles` and `metric` are accepted but not used, and nothing is
    returned.
    """
    print("Building the model...")

    # mean, std, max, min, ... are the numpy functions imported at the top of
    # this file (max and min shadow the builtins), so they reduce whole arrays.
    X = (X - mean(X))/std(X)
    Y = Y/max(Y)

    # Reference profile: the column that is most often the per-row minimum.
    ref = argmax(bincount(argmin(Y, axis=1)))
    cut = int(0.800*X.shape[0]+1)
    Xtrain, Ytrain = X[:cut,:], Y[:cut,:]
    Xtest, Ytest = X[cut:,:], Y[cut:,:]

    #Train the model
    clf = ensemble.RandomForestRegressor(10, max_depth=10).fit(Xtrain, Ytrain)

    # Ratio reference/chosen, for the predicted and for the true best column.
    predicted = argmin(clf.predict(Xtest), axis = 1)
    s = array([row[ref]/row[k] for row,k in zip(Ytest, predicted)])
    optimal = argmin(Ytest, axis = 1)
    ss = array([row[ref]/row[k] for row,k in zip(Ytest, optimal)])
    print("Testing speedup : mean = %.3f, median = %.3f, min = %.3f, max %.3f"%(gmean(s), median(s), min(s), max(s)))
    print("Optimal speedup : mean = %.3f, median = %.3f, min = %.3f, max %.3f"%(gmean(ss), median(ss), min(ss), max(ss)))
|
53
python/autotune/optimize.py
Normal file
53
python/autotune/optimize.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import array
|
||||
import numpy as np
|
||||
import random
|
||||
import sys
|
||||
|
||||
import itertools
|
||||
import tools
|
||||
import deap.tools
|
||||
|
||||
from genetic import GeneticOperators
|
||||
|
||||
#~ def parameter_space(operation):
|
||||
#~ simd = [1, 2, 4, 8]
|
||||
#~ pow2_1D = [2**k for k in range(12)]
|
||||
#~ pow2_2D = [2**i for i in range(8)]
|
||||
#~ pow2_2D_unrolled = [2**i for i in range(8)]
|
||||
#~ FetchingPolicy = vcl.atidlas.FetchingPolicy
|
||||
#~ fetch = [FetchingPolicy.FETCH_FROM_LOCAL, FetchingPolicy.FETCH_FROM_GLOBAL_CONTIGUOUS, FetchingPolicy.FETCH_FROM_GLOBAL_STRIDED]
|
||||
#~ if operation == 'vector-axpy': return [simd, pow2_1D, pow2_1D, fetch]
|
||||
#~ if operation == 'reduction': return [simd, pow2_1D, pow2_1D, fetch]
|
||||
#~ if operation == 'matrix-axpy': return [simd, pow2_2D, pow2_2D, pow2_2D, pow2_2D, fetch]
|
||||
#~ if operation == 'row-wise-reduction': return [simd, pow2_2D, pow2_2D, pow2_1D, fetch]
|
||||
#~ if operation == 'matrix-product': return [simd, pow2_2D, pow2_2D, pow2_2D, pow2_2D_unrolled, pow2_2D_unrolled, pow2_2D_unrolled, fetch, fetch, [0] + pow2_2D, [0] + pow2_2D]
|
||||
#~
|
||||
|
||||
#~ def exhaustive(statement, context, TemplateType, build_template, parameter_names, all_parameters, compute_perf, perf_metric, out):
|
||||
#~ device = context.devices[0]
|
||||
#~ nvalid = 0
|
||||
#~ current = 0
|
||||
#~ minT = float('inf')
|
||||
#~ for individual in itertools.product(*all_parameters):
|
||||
#~ template = build_template(TemplateType.Parameters(*individual))
|
||||
#~ if not tools.skip(template, statement, device):
|
||||
#~ nvalid = nvalid + 1
|
||||
#~ for individual in itertools.product(*all_parameters):
|
||||
#~ template = build_template(TemplateType.Parameters(*individual))
|
||||
#~ try:
|
||||
#~ T = tools.benchmark(template,statement,device)
|
||||
#~ current = current + 1
|
||||
#~ if T < minT:
|
||||
#~ minT = T
|
||||
#~ best = individual
|
||||
#~ sys.stdout.write('%d / %d , Best is %d %s for %s\r'%(current, nvalid, compute_perf(minT), perf_metric, best))
|
||||
#~ sys.stdout.flush()
|
||||
#~ except:
|
||||
#~ pass
|
||||
#~ sys.stdout.write('\n')
|
||||
#~ sys.stdout.flush()
|
||||
#~
|
||||
|
||||
def genetic(statement, device, TemplateType, build_template, compute_perf, perf_metric, out):
    """Tune a template with the genetic optimizer and return the best profile.

    Builds a GeneticOperators instance from the arguments and runs it with
    a budget of 2 minutes 30 seconds or 1000 generations.
    """
    operators = GeneticOperators(device, statement, TemplateType, build_template, out)
    best_profile = operators.optimize(maxtime='2m30s',
                                      maxgen=1000,
                                      compute_perf=compute_perf,
                                      perf_metric=perf_metric)
    return best_profile
|
343
python/autotune/tools.py
Normal file
343
python/autotune/tools.py
Normal file
@@ -0,0 +1,343 @@
|
||||
from __future__ import division
|
||||
|
||||
import pyopencl
|
||||
import time
|
||||
import os
|
||||
import sys
|
||||
|
||||
import pyopencl as cl
|
||||
import pyviennacl as vcl
|
||||
from pyviennacl.atidlas import StatementsTuple
|
||||
|
||||
class PhysicalLimitsNV:
    """Per-multiprocessor hardware limits of an NVIDIA device.

    The limits are chosen from the device's compute capability and stored
    as instance attributes (warps_per_mp, threads_per_mp, reg_per_thread,
    shared_mem_per_mp, ...). Only major versions 1, 2 and 3 are known.
    """

    def __init__(self, dev):
        """Look up the limits of dev; raise Exception for unknown majors."""
        self.compute_capability = pyopencl.characterize.nv_compute_capability(dev)
        major = self.compute_capability[0]

        if major == 1:
            limits = {'threads_per_warp': 32,
                      'thread_blocks_per_mp': 8,
                      'reg_alloc_granularity': 'block',
                      'reg_per_thread': 124,
                      'shared_mem_per_mp': 16384,
                      'shared_mem_alloc_unit_size': 512,
                      'warp_alloc_granularity': 2,
                      'max_thread_block_size': 512}
            # 1.0/1.1 devices have smaller warp and register budgets than 1.2+.
            if self.compute_capability[1] <= 1:
                limits.update({'warps_per_mp': 24,
                               'threads_per_mp': 768,
                               'num_32b_reg_per_mp': 8192,
                               'reg_alloc_unit_size': 256})
            else:
                limits.update({'warps_per_mp': 32,
                               'threads_per_mp': 1024,
                               'num_32b_reg_per_mp': 16384,
                               'reg_alloc_unit_size': 512})
        elif major == 2:
            limits = {'threads_per_warp': 32,
                      'warps_per_mp': 48,
                      'threads_per_mp': 1536,
                      'thread_blocks_per_mp': 8,
                      'num_32b_reg_per_mp': 32768,
                      'reg_alloc_unit_size': 64,
                      'reg_alloc_granularity': 'warp',
                      'reg_per_thread': 63,
                      'shared_mem_per_mp': 49152,
                      'shared_mem_alloc_unit_size': 128,
                      'warp_alloc_granularity': 2,
                      'max_thread_block_size': 1024}
        elif major == 3:
            limits = {'threads_per_warp': 32,
                      'warps_per_mp': 64,
                      'threads_per_mp': 2048,
                      'thread_blocks_per_mp': 16,
                      'num_32b_reg_per_mp': 65536,
                      'reg_alloc_unit_size': 256,
                      'reg_alloc_granularity': 'warp',
                      # Only minor version 5 gets the larger register budget.
                      'reg_per_thread': 255 if self.compute_capability[1] == 5 else 63,
                      'shared_mem_per_mp': 49152,
                      'shared_mem_alloc_unit_size': 256,
                      'warp_alloc_granularity': 4,
                      'max_thread_block_size': 1024}
        else:
            raise Exception('Compute capability not supported!')

        for attribute, value in limits.items():
            setattr(self, attribute, value)
|
||||
|
||||
class PhysicalLimitsAMD:
    """Per-compute-unit hardware limits of an AMD GPU, looked up by name.

    Attributes set on the instance:
        WFsize: wavefront size (always 64 here).
        WFmax_cu, LDS_cu, GPR_cu, arch: values of the table entry whose key
            equals dev.name.
    """

    def __init__(self, dev):
        """Read the limits of dev; raise KeyError if dev.name is not listed."""
        infos =\
        {
            #HD5000
            'Cedar': {'arch': 'VLIW', 'WFmax_cu': 96, 'LDS_cu': 32768, 'GPR_cu': 8192},
            'Redwood': {'arch': 'VLIW', 'WFmax_cu': 62, 'LDS_cu': 32768, 'GPR_cu': 16384},
            'Juniper': {'arch': 'VLIW', 'WFmax_cu': 24.8, 'LDS_cu': 32768, 'GPR_cu': 16384},
            'Cypress': {'arch': 'VLIW', 'WFmax_cu': 27.6, 'LDS_cu': 32768, 'GPR_cu': 16384},
            'Hemlock': {'arch': 'VLIW', 'WFmax_cu': 24.8, 'LDS_cu': 32768, 'GPR_cu': 16384},

            #HD6000
            'Seymour': {'arch': 'VLIW', 'WFmax_cu': 96, 'LDS_cu': 32768, 'GPR_cu': 16384},
            'Caicos': {'arch': 'VLIW', 'WFmax_cu': 96, 'LDS_cu': 32768, 'GPR_cu': 16384},
            'Turks': {'arch': 'VLIW', 'WFmax_cu': 41.3, 'LDS_cu': 32768, 'GPR_cu': 16384},
            'Whistler': {'arch': 'VLIW', 'WFmax_cu': 41.3, 'LDS_cu': 32768, 'GPR_cu': 16384},
            'Bart': {'arch': 'VLIW', 'WFmax_cu': 49.6, 'LDS_cu': 32768, 'GPR_cu': 16384},
            # NOTE(review): the device is believed to report itself as
            # 'Barts'; the original 'Bart' key is kept for compatibility --
            # confirm against a real device.
            'Barts': {'arch': 'VLIW', 'WFmax_cu': 49.6, 'LDS_cu': 32768, 'GPR_cu': 16384},

            #HD7000
            'Capeverde': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
            'Pitcairn': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
            'Bonaire': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
            'Tahiti': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},

            #Rx 200
            'Oland': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
            'Tonga': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
            'Hawaii': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536}
        }

        self.WFsize = 64
        # Single lookup instead of one per attribute.
        info = infos[dev.name]
        self.WFmax_cu = info['WFmax_cu']
        self.LDS_cu = info['LDS_cu']
        self.GPR_cu = info['GPR_cu']
        self.arch = info['arch']
|
||||
|
||||
def _int_floor(value, multiple_of=1):
|
||||
"""Round C{value} down to be a C{multiple_of} something."""
|
||||
# Mimicks the Excel "floor" function (for code stolen from occupancy calculator)
|
||||
from math import floor
|
||||
return int(floor(value/multiple_of))*multiple_of
|
||||
|
||||
def _int_ceiling(value, multiple_of=1):
|
||||
"""Round C{value} up to be a C{multiple_of} something."""
|
||||
# Mimicks the Excel "floor" function (for code stolen from occupancy calculator)
|
||||
from math import ceil
|
||||
return int(ceil(value/multiple_of))*multiple_of
|
||||
|
||||
class OccupancyRecord:
    """Occupancy estimate of a kernel configuration on a device.

    After construction, self.occupancy holds a percentage computed from the
    work-group size, the local-memory usage and the register usage, using
    the vendor-specific limit tables. Vendors other than AMD and NVIDIA
    have no model and are reported as fully occupied.
    """

    def init_nvidia(self, dev, threads, shared_mem, registers):
        """Set self.occupancy for an NVIDIA device.

        The number of resident blocks is the smallest of the limits imposed
        by warps, registers (if registers > 0) and shared memory (if
        shared_mem > 0).
        """
        pl = PhysicalLimitsNV(dev)
        limits = []
        allocated_warps = max(1,_int_ceiling(threads/pl.threads_per_warp))
        max_warps_per_mp = pl.warps_per_mp
        limits.append((min(pl.thread_blocks_per_mp, _int_floor(max_warps_per_mp/allocated_warps)), 'warps'))

        if registers>0:
            if registers > pl.reg_per_thread:
                # More registers per thread than the hardware allows: no block fits.
                limits.append((0, 'registers'))
            else:
                allocated_regs = {'warp': allocated_warps,
                                  'block': _int_ceiling(_int_ceiling(allocated_warps, pl.warp_alloc_granularity)*registers*pl.threads_per_warp,allocated_warps)}[pl.reg_alloc_granularity]
                max_reg_per_mp = {'warp': _int_floor(pl.num_32b_reg_per_mp/_int_ceiling(registers*pl.threads_per_warp, pl.reg_alloc_unit_size), pl.warp_alloc_granularity),
                                  'block':pl.num_32b_reg_per_mp}[pl.reg_alloc_granularity]
                limits.append((_int_floor(max_reg_per_mp/allocated_regs), 'registers'))

        if shared_mem>0:
            allocated_shared_mem = _int_ceiling(shared_mem, pl.shared_mem_alloc_unit_size)
            max_shared_mem_per_mp = pl.shared_mem_per_mp
            limits.append((_int_floor(max_shared_mem_per_mp/allocated_shared_mem), 'shared memory'))

        # Tuples compare by their first element, so this picks the tightest limit.
        limit, limited_by = min(limits)
        warps_per_mp = limit*allocated_warps
        self.occupancy = 100*warps_per_mp/pl.warps_per_mp

    def init_amd(self, dev, threads, shared_mem, NReg):
        """Set self.occupancy for an AMD device.

        The number of resident wavefronts is the smallest of the limits
        imposed by the work-group size, the LDS usage (if shared_mem > 0)
        and the register usage (if NReg > 0).
        """
        pl = PhysicalLimitsAMD(dev)
        limits = {}

        WFwg = _int_ceiling(threads/pl.WFsize)
        #WFmax without constraint
        if pl.arch=='VLIW':
            limits['wg'] = pl.WFmax_cu if WFwg > pl.WFmax_cu else _int_floor(pl.WFmax_cu,WFwg)
        else:
            limits['wg'] = min(16*WFwg, pl.WFmax_cu)
        #WFmax with LDS constraints
        if shared_mem > 0:
            WGmax = _int_floor(pl.LDS_cu/shared_mem)
            limits['lds'] = WGmax*WFwg
        #WFmax with GPR constraints
        if NReg > 0:
            #Amount of work group per CU
            NRegWG = NReg*pl.WFsize*WFwg
            WGmax = _int_floor(pl.GPR_cu/NRegWG)
            limits['gpr'] = WFwg*WGmax

        self.occupancy = 100.0*min(list(limits.values()))/pl.WFmax_cu

    def __init__(self, dev, threads, shared_mem=0, registers=0):
        """Compute the occupancy of a configuration on dev.

        Parameters:
            dev: device exposing a `vendor` string (plus whatever the
                vendor-specific limit tables read).
            threads: work-group size.
            shared_mem: local-memory usage of the kernel.
            registers: register usage per thread.
        """
        vendor = dev.vendor.lower()
        if 'advanced micro devices' in vendor:
            self.init_amd(dev, threads, shared_mem, registers)
        elif 'nvidia' in vendor:
            self.init_nvidia(dev, threads, shared_mem, registers)
        else:
            # No occupancy model for this vendor (e.g. CPU devices). Report
            # full occupancy so callers comparing against a threshold do not
            # hit a missing attribute or reject every configuration.
            self.occupancy = 100
|
||||
|
||||
|
||||
|
||||
def skip(template, statement, device):
    """Tell whether ``template`` should be skipped for ``statement`` on ``device``.

    Returns True when ``template.check(statement)`` is truthy or when the
    occupancy estimated by ``OccupancyRecord`` is below 15 (percent);
    returns False otherwise.
    """
    wrapped = StatementsTuple(statement)
    # NOTE(review): the division by 4 presumably converts a byte count into
    # 32-bit registers - confirm against the template API.
    regs = template.registers_usage(wrapped)/4
    lmem = template.lmem_usage(wrapped)
    work_group_size = template.parameters.local_size_0*template.parameters.local_size_1
    record = OccupancyRecord(device, work_group_size, lmem, regs)
    return bool(template.check(statement) or record.occupancy < 15)
|
||||
|
||||
def benchmark(template, statement, device):
    """Return the average time, in seconds, of one execution of ``template``.

    The template is executed once untimed (second argument ``True``), then
    repeatedly (second argument ``False``) until at least 0.1 s of measured
    time has accumulated. Every run is followed by
    ``finish_all_queues()`` on the context of ``statement.result`` so that
    the timing includes completion of the queued work.

    Raises:
        ValueError: if the occupancy estimated by ``OccupancyRecord`` is
            below 15 (percent).
    """
    # default_timer is the highest-resolution clock available on the platform
    # (and monotonic on Python 3), unlike time.time() which follows wall-clock
    # adjustments.
    from timeit import default_timer

    statements = StatementsTuple(statement)
    # NOTE(review): the division by 4 presumably converts a byte count into
    # 32-bit registers - confirm against the template API.
    registers_usage = template.registers_usage(statements)/4
    lmem_usage = template.lmem_usage(statements)
    local_size = template.parameters.local_size_0*template.parameters.local_size_1
    occupancy_record = OccupancyRecord(device, local_size, lmem_usage, registers_usage)
    if occupancy_record.occupancy < 15:
        raise ValueError("Template has too low occupancy")

    # Untimed first run, kept out of the measurement.
    template.execute(statement, True)
    statement.result.context.finish_all_queues()

    runs = 0
    elapsed = 0
    while elapsed < 1e-1:
        started = default_timer()
        template.execute(statement, False)
        statement.result.context.finish_all_queues()
        elapsed += default_timer() - started
        runs += 1
    return elapsed/runs
|
||||
|
||||
|
||||
def update_viennacl_headers(viennacl_root, device, datatype, operation, additional_parameters, parameters):
    """Record tuned ``parameters`` for ``device`` in the ViennaCL built-in database headers.

    Two files below ``<viennacl_root>/device_specific/builtin_database`` are
    modified:

    * the device header ``devices/<type>/<vendor>/<family>/<device>.hpp``,
      created if missing, in which the ``add_4B`` / ``add_8B`` function for
      this operation is inserted or has its body replaced;
    * the operation header ``<operation>.hpp``, which gets an ``#include`` of
      the device header and a call to that function before the
      ``return result`` of the matching ``init_<operation>...`` scope.

    Args:
        viennacl_root: path of the ViennaCL source directory containing
            ``device_specific/builtin_database``.
        device: device object exposing ``name``, ``type`` and ``vendor_id``.
        datatype: ``vcl.float32`` or ``vcl.float64``.
        operation: operation name; hyphens are replaced by underscores.
        additional_parameters: iterable of ``'N'`` / ``'T'`` keys.
        parameters: iterable of values, converted with ``str`` and passed to
            the ``parameters_type`` constructor in the generated code.

    Raises:
        EnvironmentError: if the built-in database directory does not exist.
        KeyError: if the datatype, device type, vendor id or an additional
            parameter is not one of the supported values.
        ValueError: if an expected marker (``#define``, the device namespace,
            the ``init_...`` scope or ``return result``) is not found.
    """

    def sanitize_string(string, keep_chars=('_',)):
        """Turn ``string`` into a lower-case identifier-like name."""
        string = string.replace(' ', '_').replace('-', '_').lower()
        string = "".join(c for c in string if c.isalnum() or c in keep_chars).rstrip()
        return string

    def append_include(data, path):
        """Return ``data`` with ``#include "path"`` added after the first ``#define`` line."""
        include_name = '#include "' + path + '"\n'
        if include_name in data:
            return data
        insert_index = data.index('\n', data.index('#define')) + 1
        return data[:insert_index] + '\n' + include_name + data[insert_index:]

    builtin_database_dir = os.path.join(viennacl_root, "device_specific", "builtin_database")
    if not os.path.isdir(builtin_database_dir):
        raise EnvironmentError('ViennaCL root path is incorrect. Cannot access ' + builtin_database_dir + '!\n'
                               'Your version of ViennaCL may be too old and/or corrupted.')

    function_name_dict = {vcl.float32: 'add_4B',
                          vcl.float64: 'add_8B'}

    additional_parameters_dict = {'N': "char_to_type<'N'>",
                                  'T': "char_to_type<'T'>"}

    # Accept any iterable (list, tuple, ...): the scope name below is built by
    # tuple concatenation, which would otherwise reject non-tuple input.
    additional_parameters = tuple(additional_parameters)

    # Create the device-specific headers
    cpp_device_name = sanitize_string(device.name)
    function_name = function_name_dict[datatype]
    operation = operation.replace('-', '_')

    cpp_class_name = operation + '_template'
    header_name = cpp_device_name + ".hpp"
    function_declaration = 'inline void ' + function_name + '(' + ', '.join(
        ['database_type<' + cpp_class_name + '::parameters_type> & db'] +
        [additional_parameters_dict[x] for x in additional_parameters]) + ')'

    device_type_prefix = {
        cl.device_type.GPU: 'gpu',
        cl.device_type.CPU: 'cpu',
        cl.device_type.ACCELERATOR: 'accelerator'
    }[device.type]
    vendor_prefix = {
        vcl.opencl.VendorId.beignet_id: 'beignet',
        vcl.opencl.VendorId.nvidia_id: 'nvidia',
        vcl.opencl.VendorId.amd_id: 'amd',
        vcl.opencl.VendorId.intel_id: 'intel'
    }[device.vendor_id]
    architecture_family = vcl.opencl.architecture_family(device.vendor_id, device.name)

    header_hierarchy = ["devices", device_type_prefix, vendor_prefix, architecture_family]
    header_directory = os.path.join(builtin_database_dir, *header_hierarchy)
    header_path = os.path.join(header_directory, header_name)

    if not os.path.exists(header_directory):
        os.makedirs(header_directory)

    if os.path.exists(header_path):
        with open(header_path, "r") as myfile:
            data = myfile.read()
    else:
        data = ''

    if not data:
        # The guard must contain the device name: several device headers live
        # in the same architecture-family directory and are all included by
        # the same operation header, so a shared guard would make the
        # preprocessor drop every header but the first.
        ifndef_suffix = ('_'.join(header_hierarchy + [cpp_device_name]) + '_hpp_').upper()
        data = ('#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_' + ifndef_suffix + '\n'
                '#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_' + ifndef_suffix + '\n'
                '\n'
                '#include "viennacl/device_specific/forwards.h"\n'
                '#include "viennacl/device_specific/builtin_database/common.hpp"\n'
                '\n'
                'namespace viennacl{\n'
                'namespace device_specific{\n'
                'namespace builtin_database{\n'
                'namespace devices{\n'
                'namespace ' + device_type_prefix + '{\n'
                'namespace ' + vendor_prefix + '{\n'
                'namespace ' + architecture_family + '{\n'
                'namespace ' + cpp_device_name + '{\n'
                '\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '#endif\n'
                '')

    data = append_include(data, 'viennacl/device_specific/templates/' + cpp_class_name + '.hpp')
    device_type = {
        cl.device_type.GPU: 'CL_DEVICE_TYPE_GPU',
        cl.device_type.CPU: 'CL_DEVICE_TYPE_CPU',
        cl.device_type.ACCELERATOR: 'CL_DEVICE_TYPE_ACCELERATOR'
    }[device.type]
    add_to_database_arguments = [vendor_prefix + '_id', device_type, 'ocl::' + architecture_family,
                                 '"' + device.name + '"',
                                 cpp_class_name + '::parameters_type(' + ','.join(map(str, parameters)) + ')']
    core = ' db.' + function_name + '(' + ', '.join(add_to_database_arguments) + ');'

    already_declared = data.find(function_declaration)
    if already_declared == -1:
        # First profile for this operation/datatype: add the function right
        # after the opening of the device namespace.
        substr = 'namespace ' + cpp_device_name + '{\n'
        insert_index = data.index(substr) + len(substr)
        data = data[:insert_index] + '\n' + function_declaration + '\n{\n' + core + '\n}\n' + data[insert_index:]
    else:
        # The function already exists: replace its body (from the opening
        # brace, including a preceding newline, up to the first closing brace).
        i1 = data.find('{', already_declared)
        if data[i1-1] == '\n':
            i1 = i1 - 1
        i2 = data.find('}', already_declared) + 1
        data = data[:i1] + '\n{\n' + core + '\n}' + data[i2:]

    # Write the header file
    with open(header_path, "w") as myfile:
        myfile.write(data)

    # Updates the global ViennaCL headers
    with open(os.path.join(builtin_database_dir, operation + '.hpp'), 'r+') as operation_header:
        data = operation_header.read()
        # #include paths always use forward slashes, whatever the host OS.
        include_path = os.path.relpath(header_path, os.path.join(viennacl_root, os.pardir)).replace(os.sep, '/')
        data = append_include(data, include_path)

        scope_name = '_'.join(('init', operation) + additional_parameters)
        scope = data.index(scope_name)
        function_call = ' ' + '::'.join(header_hierarchy + [cpp_device_name, function_name]) + '(' + ', '.join(['result'] + [additional_parameters_dict[k] + '()' for k in additional_parameters]) + ')'
        if function_call not in data:
            # Insert at the newline that precedes the scope's "return result" line.
            insert_index = data.rindex('\n', 0, data.index('return result', scope))
            data = data[:insert_index] + function_call + ';\n' + data[insert_index:]

        # Rewrite the file in place with the updated contents.
        operation_header.seek(0)
        operation_header.truncate()
        operation_header.write(data)
|
Reference in New Issue
Block a user