Restored VCL header generation functionality

This commit is contained in:
Philippe Tillet
2014-10-05 05:16:21 +02:00
parent fc8b450a7c
commit acb7fe73e8
9 changed files with 189 additions and 185 deletions

View File

@@ -8,6 +8,7 @@
#include "viennacl/tools/tools.hpp" #include "viennacl/tools/tools.hpp"
#include "viennacl/tools/timer.hpp" #include "viennacl/tools/timer.hpp"
#include "viennacl/scheduler/forwards.h" #include "viennacl/scheduler/forwards.h"
#include "viennacl/scheduler/io.hpp"
#include "atidlas/forwards.h" #include "atidlas/forwards.h"
#include "atidlas/templates/template_base.hpp" #include "atidlas/templates/template_base.hpp"
@@ -25,6 +26,7 @@ namespace atidlas
inline void execute(template_base const & T, statements_container const & statements, viennacl::ocl::context & ctx = viennacl::ocl::current_context(), bool force_compilation = false) inline void execute(template_base const & T, statements_container const & statements, viennacl::ocl::context & ctx = viennacl::ocl::current_context(), bool force_compilation = false)
{ {
//std::cout << statements.data().front() << std::endl;
//Generate program name //Generate program name
std::string program_name = tools::statements_representation(statements, BIND_TO_HANDLE); std::string program_name = tools::statements_representation(statements, BIND_TO_HANDLE);
execution_handler handler(program_name, ctx, ctx.current_device(), force_compilation); execution_handler handler(program_name, ctx, ctx.current_device(), force_compilation);

View File

@@ -148,7 +148,7 @@ private:
str[0] = "#namereg"; str[0] = "#namereg";
else else
for (unsigned int a = 0; a < simd_width; ++a) for (unsigned int a = 0; a < simd_width; ++a)
str[a] = "#namereg.s" + tools::to_string(a); str[a] = append_simd_suffix("#namereg.s", a);
for (unsigned int k = 0; k < exprs.size(); ++k) for (unsigned int k = 0; k < exprs.size(); ++k)
{ {

View File

@@ -110,7 +110,7 @@ private:
str[0] = "#namereg"; str[0] = "#namereg";
else else
for (unsigned int a = 0; a < simd_width; ++a) for (unsigned int a = 0; a < simd_width; ++a)
str[a] = "#namereg.s" + to_string(a); str[a] = append_simd_suffix("#namereg.s",a);
for (unsigned int k = 0; k < exprs.size(); ++k) for (unsigned int k = 0; k < exprs.size(); ++k)

View File

@@ -383,6 +383,14 @@ protected:
protected: protected:
/**
 * @brief Appends the single-character selector of SIMD component @p i to @p str.
 *
 * Components 0-9 map to '0'-'9' and components 10-15 map to 'a'-'f', so that
 * e.g. ("#namereg.s", 10) yields "#namereg.sa".
 *
 * @param str  Prefix to extend (callers pass a register name ending in ".s").
 * @param i    Component index; must be smaller than 16 (checked by assert only).
 * @return     @p str followed by exactly one selector character.
 */
static std::string append_simd_suffix(std::string const & str, unsigned int i)
{
  // Read-only lookup table: it must never be modified, hence const.
  static const char suffixes[] = {'0','1','2','3','4','5','6','7','8','9',
                                  'a','b','c','d','e','f'};
  // Bound derived from the table itself so the two cannot drift apart.
  assert(i < sizeof(suffixes)/sizeof(suffixes[0]));
  // Append the character directly; no generic string conversion is needed.
  return str + suffixes[i];
}
static bool is_offset_modifier(viennacl::scheduler::statement_node const & node) static bool is_offset_modifier(viennacl::scheduler::statement_node const & node)
{ {
return node.op.type==viennacl::scheduler::OPERATION_BINARY_VECTOR_DIAG_TYPE return node.op.type==viennacl::scheduler::OPERATION_BINARY_VECTOR_DIAG_TYPE

View File

@@ -1,24 +1,23 @@
#will save the archive into /tmp/name-of-operation.dat viennacl-src-root = /home/philippe/Development/viennacl-dev/viennacl/
tmp-folder = /tmp/
[vector-axpy] [vector-axpy]
devices = 0 devices = 0
precision = single precision = single, double
#~ size = 10000000 size = 5000000
#~
#~ [matrix-axpy] [matrix-axpy]
#~ devices = 0 devices = 0
#~ precision = single precision = single, double
#~ size = 3072, 3072 size = 2560, 2560
#~
#~ [row-wise-reduction] [row-wise-reduction]
#~ devices = 0 devices = 0
#~ precision = single precision = single, double
#~ layout = N, T layout = N,T
#~ size = 3968, 3968 size = 2560, 2560
[matrix-product] [matrix-product]
devices = 0 devices = 0
precision = single precision = single
layout = NT layout = NN,NT,TN,TT
#size = 1536, 1536, 1536 size = 1536, 1536, 1536

View File

@@ -54,6 +54,8 @@ def do_tuning(config_fname, spec_fname, viennacl_root):
confdevices = p['devices'] confdevices = p['devices']
devices = utils.DEVICES_PRESETS[confdevices] if confdevices in utils.DEVICES_PRESETS else [utils.all_devices[int(i)] for i in confdevices] devices = utils.DEVICES_PRESETS[confdevices] if confdevices in utils.DEVICES_PRESETS else [utils.all_devices[int(i)] for i in confdevices]
precisions = map_to_list(str, p['precision']) precisions = map_to_list(str, p['precision'])
if 'all' in precisions:
precisions = ['single','double']
datatypes = [DATATYPES[k] for k in precisions] datatypes = [DATATYPES[k] for k in precisions]
#Iterate through the datatypes and the devices #Iterate through the datatypes and the devices
for datatype, device in itertools.product(datatypes, devices): for datatype, device in itertools.product(datatypes, devices):
@@ -64,20 +66,23 @@ def do_tuning(config_fname, spec_fname, viennacl_root):
if datatype is vcl.float64 and not device.double_fp_config: if datatype is vcl.float64 and not device.double_fp_config:
sys.stderr.write('Warning : The device ' + device.name + ' does not support double precision! Skipping ...') sys.stderr.write('Warning : The device ' + device.name + ' does not support double precision! Skipping ...')
continue continue
#Helper #Helper for execution
def execute(device, statement, other_params, sizes, fname = os.devnull, parameters = None): def execute(device, node, other_params, sizes, fname = os.devnull, parameters = None):
if parameters: if parameters:
TemplateType = TYPES[operation]['template'] TemplateType = TYPES[operation]['template']
return tools.benchmark(TemplateType(TemplateType.Parameters(*parameters),*other_params), statement, device) return tools.benchmark(TemplateType(TemplateType.Parameters(*parameters),*other_params), statement, device)
print('-----') print('-----')
print(' '.join(map(str, ("Now tuning:", datatype.__name__, '-', operation, '-'.join(other_params), '[' + device.name, '(' + device.platform.name + ')] for sizes', sizes)))) print(' '.join(map(str, ("Now tuning:", datatype.__name__, '-', operation, '-'.join(other_params), '[' + device.name, '(' + device.platform.name + ')] for sizes', sizes))))
with open(fname, "w+") as archive: with open(fname, "w+") as archive:
return optimize.genetic(statement, device, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params), with vcl.Statement(node) as statement:
return optimize.genetic(statement, device, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params),
lambda t: TYPES[operation]['perf-index']([datatype().itemsize, sizes, t]), TYPES[operation]['perf-measure'], archive) lambda t: TYPES[operation]['perf-index']([datatype().itemsize, sizes, t]), TYPES[operation]['perf-measure'], archive)
#Helper #Helper for tuning
def tune(execution_handler, nTuning, nDataPoints, draw): def tune(execution_handler, nTuning, nDataPoints, draw, additional_parameters):
if 'size' in p: if 'size' in p:
profile = execution_handler(map_to_list(int, p['size'])) profile = execution_handler(map_to_list(int, p['size']))
if 'viennacl-src-root' in config:
tools.update_viennacl_headers(config['viennacl-src-root'],device,datatype,operation,additional_parameters,profile)
else: else:
def compute_perf(x, t): def compute_perf(x, t):
return TYPES[operation]['perf-index']([datatype().itemsize, x, t]) return TYPES[operation]['perf-index']([datatype().itemsize, x, t])
@@ -89,15 +94,17 @@ def do_tuning(config_fname, spec_fname, viennacl_root):
def execution_handler(sizes, fname=os.devnull, parameters=None): def execution_handler(sizes, fname=os.devnull, parameters=None):
x = vcl.Vector(sizes[0], context=ctx, dtype=datatype) x = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
y = vcl.Vector(sizes[0], context=ctx, dtype=datatype) y = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
return execute(device, vcl.Statement(vcl.ElementProd(vcl.exp(x + y),vcl.cos(x + y))), (), sizes, fname, parameters) z = vcl.Vector(sizes[0], context=ctx, dtype=datatype)
tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=10, high=100000, size=1)) return execute(device, vcl.Assign(z, vcl.ElementProd(vcl.exp(x + y),vcl.cos(x + y))), (), sizes, fname, parameters)
tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=10, high=100000, size=1), ())
#Matrix AXPY #Matrix AXPY
if operation=='matrix-axpy': if operation=='matrix-axpy':
def execution_handler(sizes, fname=os.devnull, parameters=None): def execution_handler(sizes, fname=os.devnull, parameters=None):
A = vcl.Matrix(sizes, context=ctx, dtype=datatype) A = vcl.Matrix(sizes, context=ctx, dtype=datatype)
B = vcl.Matrix(sizes, context=ctx, dtype=datatype) B = vcl.Matrix(sizes, context=ctx, dtype=datatype)
return execute(device, vcl.Statement(A+B), (), sizes, fname, parameters) C = vcl.Matrix(sizes, context=ctx, dtype=datatype)
tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=5, high=100, size=2)) return execute(device, vcl.Assign(C,A+B), (), sizes, fname, parameters)
tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=5, high=100, size=2), ())
#Row-wise reduction #Row-wise reduction
if operation=='row-wise-reduction': if operation=='row-wise-reduction':
layouts = map_to_list(str,p['layout']) layouts = map_to_list(str,p['layout'])
@@ -107,9 +114,10 @@ def do_tuning(config_fname, spec_fname, viennacl_root):
def execution_handler(sizes, fname=os.devnull, parameters=None): def execution_handler(sizes, fname=os.devnull, parameters=None):
A = vcl.Matrix(sizes if A_trans=='N' else sizes[::-1], context=ctx, dtype=datatype, layout=vcl.COL_MAJOR) A = vcl.Matrix(sizes if A_trans=='N' else sizes[::-1], context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
x = vcl.Vector(sizes[1] if A_trans=='N' else sizes[0], context=ctx, dtype=datatype) x = vcl.Vector(sizes[1] if A_trans=='N' else sizes[0], context=ctx, dtype=datatype)
y = vcl.Vector(sizes[0] if A_trans=='N' else sizes[1], context=ctx, dtype=datatype)
LHS = A if A_trans=='N' else A.T LHS = A if A_trans=='N' else A.T
execute(device, vcl.Statement(LHS*x), (), sizes, fname, parameters) return execute(device, vcl.Assign(y, LHS*x), (), sizes, fname, parameters)
tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=5, high=100, size=2)) tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=5, high=100, size=2), (A_trans,))
#Matrix Product #Matrix Product
if operation=='matrix-product': if operation=='matrix-product':
layouts = map_to_list(str,p['layout']) layouts = map_to_list(str,p['layout'])
@@ -126,9 +134,8 @@ def do_tuning(config_fname, spec_fname, viennacl_root):
alpha = vcl.HostScalar(1.0, context=ctx, dtype = datatype) alpha = vcl.HostScalar(1.0, context=ctx, dtype = datatype)
beta = vcl.HostScalar(1.0, context=ctx, dtype = datatype) beta = vcl.HostScalar(1.0, context=ctx, dtype = datatype)
C = vcl.Matrix((sizes[0], sizes[2]), context=ctx, dtype = datatype, layout=vcl.COL_MAJOR) C = vcl.Matrix((sizes[0], sizes[2]), context=ctx, dtype = datatype, layout=vcl.COL_MAJOR)
statement = vcl.Statement(vcl.Assign(C,LHS*RHS*alpha + C*beta)) return execute(device, vcl.Assign(C,LHS*RHS*alpha + C*beta),(A_trans, B_trans), sizes, fname, parameters)
return execute(device, statement,(A_trans, B_trans), sizes, fname, parameters) tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=1, high=40, size=3),(layout[0], layout[1]))
tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=1, high=40, size=3))

View File

@@ -1,6 +1,11 @@
from __future__ import division from __future__ import division
import pyopencl import pyopencl
import time import time
import os
import sys
import pyopencl as cl
import pyviennacl as vcl
from pyviennacl.atidlas import StatementsTuple from pyviennacl.atidlas import StatementsTuple
class PhysicalLimitsNV: class PhysicalLimitsNV:
@@ -158,7 +163,6 @@ def benchmark(template, statement, device):
if occupancy_record.occupancy < 15 : if occupancy_record.occupancy < 15 :
raise ValueError("Template has too low occupancy") raise ValueError("Template has too low occupancy")
else: else:
#~ try:
template.execute(statement, True) template.execute(statement, True)
statement.result.context.finish_all_queues() statement.result.context.finish_all_queues()
N = 0 N = 0
@@ -170,5 +174,138 @@ def benchmark(template, statement, device):
current_time += time.time() - time_before current_time += time.time() - time_before
N+=1 N+=1
return current_time/N return current_time/N
#~ except:
#~ raise ValueError("Invalid template")
def update_viennacl_headers(viennacl_root, device, datatype, operation, additional_parameters, parameters):
    """Write a tuned profile into the ViennaCL built-in database headers.

    Two files below ``<viennacl_root>/device_specific/builtin_database`` are
    touched:

    * the device-specific header
      ``devices/<type>/<vendor>/<family>/<device>.hpp``, created on demand,
      which receives (or gets replaced) an ``add_4B``/``add_8B`` function
      registering ``parameters`` for this device;
    * the per-operation header ``<operation>.hpp``, which receives the matching
      ``#include`` and a call to that function inside the ``init_<operation>``
      scope, right before its ``return result`` statement.

    Args:
        viennacl_root: Path of the ``viennacl`` source directory.
        device: Device object exposing ``name``, ``type`` and ``vendor_id``.
        datatype: ``vcl.float32`` or ``vcl.float64``.
        operation: Operation name; dashes are turned into underscores.
        additional_parameters: Sequence of ``'N'``/``'T'`` markers selecting
            the overload (may be empty).
        parameters: Iterable of values forwarded, via ``str``, to the
            ``parameters_type`` constructor in the generated C++.

    Raises:
        EnvironmentError: If the built-in database directory does not exist.
        KeyError: If ``datatype``, ``device.type``, ``device.vendor_id`` or an
            entry of ``additional_parameters`` is not one of the known keys.
        ValueError: If an expected anchor (``#define``, the device namespace,
            the ``init_*`` scope or ``return result``) is missing from a header.
    """

    def sanitize_string(string, keep_chars = ('_',)):
        # Turn an arbitrary device name into a lowercase identifier-like token.
        string = string.replace(' ', '_').replace('-', '_').lower()
        string = "".join(c for c in string if c.isalnum() or c in keep_chars).rstrip()
        return string

    def append_include(data, path):
        # Add '#include "path"' right after the first '#define' line, once.
        include_name = '#include "' + path +'"\n'
        already_included = data.find(include_name)
        if already_included == -1:
            insert_index = data.index('\n', data.index('#define')) + 1
            return data[:insert_index] + '\n' + include_name + data[insert_index:]
        return data

    builtin_database_dir = os.path.join(viennacl_root, "device_specific", "builtin_database")
    if not os.path.isdir(builtin_database_dir):
        raise EnvironmentError('ViennaCL root path is incorrect. Cannot access ' + builtin_database_dir + '!\n'
                               'Your version of ViennaCL may be too old and/or corrupted.')

    function_name_dict = { vcl.float32: 'add_4B',
                           vcl.float64: 'add_8B' }

    additional_parameters_dict = {'N': "char_to_type<'N'>",
                                  'T': "char_to_type<'T'>"}

    # Accept any sequence (tuple, list, ...) for the overload markers; the
    # scope name below is built by tuple concatenation.
    additional_parameters = tuple(additional_parameters)

    #Create the device-specific headers
    cpp_device_name = sanitize_string(device.name)
    function_name = function_name_dict[datatype]
    operation = operation.replace('-','_')

    cpp_class_name = operation + '_template'
    header_name = cpp_device_name + ".hpp"
    function_declaration = 'inline void ' + function_name + '(' + ', '.join(['database_type<' + cpp_class_name + '::parameters_type> & db'] + \
                           [additional_parameters_dict[x] for x in additional_parameters]) + ')'

    device_type_prefix = {
        cl.device_type.GPU: 'gpu',
        cl.device_type.CPU: 'cpu',
        cl.device_type.ACCELERATOR: 'accelerator'
    }[device.type]
    vendor_prefix = {
        vcl.opencl.VendorId.beignet_id: 'beignet',
        vcl.opencl.VendorId.nvidia_id: 'nvidia',
        vcl.opencl.VendorId.amd_id: 'amd',
        vcl.opencl.VendorId.intel_id: 'intel'
    }[device.vendor_id]
    architecture_family = vcl.opencl.architecture_family(device.vendor_id, device.name)

    header_hierarchy = ["devices", device_type_prefix, vendor_prefix, architecture_family]
    header_directory = os.path.join(builtin_database_dir, *header_hierarchy)
    header_path = os.path.join(header_directory, header_name)

    if not os.path.exists(header_directory):
        os.makedirs(header_directory)

    if os.path.exists(header_path):
        with open (header_path, "r") as myfile:
            data=myfile.read()
    else:
        data = ''

    if not data:
        # The guard must be unique per header. Several device headers live in
        # the same architecture-family directory and are all included by the
        # operation header, so the device name has to be part of the macro;
        # otherwise every header after the first would be preprocessed away.
        ifndef_suffix = ('_'.join(header_hierarchy + [cpp_device_name]) + '_hpp_').upper()
        data = ('#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_' + ifndef_suffix + '\n'
                '#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_' + ifndef_suffix + '\n'
                '\n'
                '#include "viennacl/device_specific/forwards.h"\n'
                '#include "viennacl/device_specific/builtin_database/common.hpp"\n'
                '\n'
                'namespace viennacl{\n'
                'namespace device_specific{\n'
                'namespace builtin_database{\n'
                'namespace devices{\n'
                'namespace ' + device_type_prefix + '{\n'
                'namespace ' + vendor_prefix + '{\n'
                'namespace ' + architecture_family + '{\n'
                'namespace ' + cpp_device_name + '{\n'
                '\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '#endif\n'
                '')

    data = append_include(data, 'viennacl/device_specific/templates/' + cpp_class_name + '.hpp')
    device_type = {
        cl.device_type.GPU: 'CL_DEVICE_TYPE_GPU',
        cl.device_type.CPU: 'CL_DEVICE_TYPE_CPU',
        cl.device_type.ACCELERATOR: 'CL_DEVICE_TYPE_ACCELERATOR'
    }[device.type]
    add_to_database_arguments = [vendor_prefix + '_id', device_type, 'ocl::'+architecture_family,
                                 '"' + device.name + '"', cpp_class_name + '::parameters_type(' + ','.join(map(str,parameters)) + ')']
    core = ' db.' + function_name + '(' + ', '.join(add_to_database_arguments) + ');'

    already_declared = data.find(function_declaration)
    if already_declared==-1:
        # First profile for this overload: add the function at the top of the
        # innermost (device) namespace.
        substr = 'namespace ' + cpp_device_name + '{\n'
        insert_index = data.index(substr) + len(substr)
        data = data[:insert_index] + '\n' + function_declaration + '\n{\n' + core + '\n}\n' + data[insert_index:]
    else:
        # Overload already present: replace its body (from the opening brace,
        # including a preceding newline, up to the first closing brace).
        i1 = data.find('{', already_declared)
        if data[i1-1]=='\n':
            i1 = i1 - 1
        i2 = data.find('}', already_declared) + 1
        data = data[:i1] + '\n{\n' + core + '\n}' + data[i2:]

    #Write the header file
    with open(header_path, "w+") as myfile:
        myfile.write(data)

    #Updates the global ViennaCL headers
    with open(os.path.join(builtin_database_dir, operation + '.hpp'), 'r+') as operation_header:
        data = operation_header.read()
        data = append_include(data, os.path.relpath(header_path, os.path.join(viennacl_root, os.pardir)))

        scope_name = '_'.join(('init', operation) + additional_parameters)
        scope = data.index(scope_name)
        function_call = ' ' + '::'.join(header_hierarchy + [cpp_device_name, function_name]) + '(' + ', '.join(['result'] + [additional_parameters_dict[k] + '()' for k in additional_parameters]) + ')'
        if function_call not in data:
            # Insert the call on its own line just before 'return result'.
            insert_index = data.rindex('\n', 0, data.index('return result', scope))
            data = data[:insert_index] + function_call + ';\n' + data[insert_index:]

        # Rewrite the file in place with the updated contents.
        operation_header.seek(0)
        operation_header.truncate()
        operation_header.write(data)

View File

@@ -1,33 +0,0 @@
import pyopencl as cl
import pyviennacl as vcl
# Flat list of every device of every platform reported by cl.get_platforms().
all_devices = [d for platform in cl.get_platforms() for d in platform.get_devices()]
# Short lowercase name for each supported pyopencl device type.
DEVICE_TYPE_PREFIX = { cl.device_type.GPU: 'gpu',
cl.device_type.CPU: 'cpu',
cl.device_type.ACCELERATOR: 'accelerator'
}
# Spelling of the corresponding OpenCL device-type constant, as a string.
DEVICE_TYPE_CL_NAME = { cl.device_type.GPU: 'CL_DEVICE_TYPE_GPU',
cl.device_type.CPU: 'CL_DEVICE_TYPE_CPU',
cl.device_type.ACCELERATOR: 'CL_DEVICE_TYPE_ACCELERATOR'
}
# Short lowercase vendor name for each vendor id known to vcl.opencl.VendorId.
VENDOR_PREFIX = { vcl.opencl.VendorId.beignet_id: 'beignet',
vcl.opencl.VendorId.nvidia_id: 'nvidia',
vcl.opencl.VendorId.amd_id: 'amd',
vcl.opencl.VendorId.intel_id: 'intel'
}
# Named device selections: everything, or all_devices filtered by device type.
DEVICES_PRESETS = {'all': all_devices,
'gpus': [d for d in all_devices if d.type==cl.device_type.GPU],
'cpus': [d for d in all_devices if d.type==cl.device_type.CPU],
'accelerators': [d for d in all_devices if d.type==cl.device_type.ACCELERATOR]
}
def sanitize_string(string, keep_chars = ('_',)):
    """Reduce ``string`` to a lowercase token of alphanumerics and kept characters.

    Spaces are first replaced by underscores and the text is lowercased; every
    character that is neither alphanumeric nor listed in ``keep_chars`` is then
    dropped, and trailing whitespace is stripped from the result.

    Args:
        string: Text to sanitize (e.g. a device name).
        keep_chars: Container of extra characters to keep. The default is an
            immutable tuple so that it can never be altered between calls.

    Returns:
        The sanitized string.
    """
    string = string.replace(' ', '_').lower()
    string = "".join(c for c in string if c.isalnum() or c in keep_chars).rstrip()
    return string

View File

@@ -1,116 +0,0 @@
import sys
import os
import utils
def append_include(data, path):
    """Return ``data`` with an ``#include "path"`` line added if it is missing.

    The directive is inserted, preceded by an empty line, immediately after
    the line containing the first ``#define`` of ``data``. When the exact
    directive (including its trailing newline) is already present, ``data``
    is returned unchanged.

    Raises:
        ValueError: If the directive is missing and ``data`` has no ``#define``
            or no newline after it.
    """
    directive = '#include "' + path +'"\n'
    if data.find(directive) != -1:
        return data
    # Position of the first character following the '#define' line.
    guard_at = data.index('#define')
    split_at = data.index('\n', guard_at) + 1
    head, tail = data[:split_at], data[split_at:]
    return head + '\n' + directive + tail
def generate_viennacl_headers(viennacl_root, device, datatype, operation, additional_parameters, parameters):
    """Write a tuned profile into the ViennaCL built-in database headers.

    Two files below ``<viennacl_root>/device_specific/builtin_database`` are
    touched:

    * the device-specific header
      ``devices/<type>/<vendor>/<family>/<device>.hpp``, created on demand,
      which receives (or gets replaced) an ``add_4B``/``add_8B`` function
      registering ``parameters`` for this device;
    * the per-operation header ``<operation>.hpp``, which receives the matching
      ``#include`` and a call to that function inside the ``init_<operation>``
      scope, right before its ``return result`` statement.

    Args:
        viennacl_root: Path of the ``viennacl`` source directory.
        device: Device object exposing ``name``, ``type`` and ``vendor_id``.
        datatype: ``vcl.float32`` or ``vcl.float64``.
        operation: Operation name; dashes are turned into underscores.
        additional_parameters: Sequence of ``'N'``/``'T'`` markers selecting
            the overload (may be empty).
        parameters: Iterable of values forwarded, via ``str``, to the
            ``parameters_type`` constructor in the generated C++.

    Raises:
        EnvironmentError: If the built-in database directory does not exist.
        KeyError: If ``datatype``, ``device.type``, ``device.vendor_id`` or an
            entry of ``additional_parameters`` is not one of the known keys.
        ValueError: If an expected anchor (``#define``, the device namespace,
            the ``init_*`` scope or ``return result``) is missing from a header.
    """
    # The module header imports only sys, os and utils, while this body needs
    # vcl.float32/float64 and vcl.opencl; import it here to avoid a NameError.
    import pyviennacl as vcl

    builtin_database_dir = os.path.join(viennacl_root, "device_specific", "builtin_database")
    if not os.path.isdir(builtin_database_dir):
        raise EnvironmentError('ViennaCL root path is incorrect. Cannot access ' + builtin_database_dir + '!\n'
                               'Your version of ViennaCL may be too old and/or corrupted.')

    function_name_dict = { vcl.float32: 'add_4B',
                           vcl.float64: 'add_8B' }

    additional_parameters_dict = {'N': "char_to_type<'N'>",
                                  'T': "char_to_type<'T'>"}

    # Accept any sequence (tuple, list, ...) for the overload markers; the
    # scope name below is built by tuple concatenation.
    additional_parameters = tuple(additional_parameters)

    #Create the device-specific headers
    cpp_device_name = utils.sanitize_string(device.name)
    function_name = function_name_dict[datatype]
    operation = operation.replace('-','_')

    cpp_class_name = operation + '_template'
    header_name = cpp_device_name + ".hpp"
    function_declaration = 'inline void ' + function_name + '(' + ', '.join(['database_type<' + cpp_class_name + '::parameters_type> & db'] + \
                           [additional_parameters_dict[x] for x in additional_parameters]) + ')'

    device_type_prefix = utils.DEVICE_TYPE_PREFIX[device.type]
    vendor_prefix = utils.VENDOR_PREFIX[device.vendor_id]
    architecture_family = vcl.opencl.architecture_family(device.vendor_id, device.name)

    header_hierarchy = ["devices", device_type_prefix, vendor_prefix, architecture_family]
    header_directory = os.path.join(builtin_database_dir, *header_hierarchy)
    header_path = os.path.join(header_directory, header_name)

    if not os.path.exists(header_directory):
        os.makedirs(header_directory)

    if os.path.exists(header_path):
        with open (header_path, "r") as myfile:
            data=myfile.read()
    else:
        data = ''

    if not data:
        # The guard must be unique per header. Several device headers live in
        # the same architecture-family directory and are all included by the
        # operation header, so the device name has to be part of the macro;
        # otherwise every header after the first would be preprocessed away.
        ifndef_suffix = ('_'.join(header_hierarchy + [cpp_device_name]) + '_hpp_').upper()
        data = ('#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_' + ifndef_suffix + '\n'
                '#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_' + ifndef_suffix + '\n'
                '\n'
                '#include "viennacl/device_specific/forwards.h"\n'
                '#include "viennacl/device_specific/builtin_database/common.hpp"\n'
                '\n'
                'namespace viennacl{\n'
                'namespace device_specific{\n'
                'namespace builtin_database{\n'
                'namespace devices{\n'
                'namespace ' + device_type_prefix + '{\n'
                'namespace ' + vendor_prefix + '{\n'
                'namespace ' + architecture_family + '{\n'
                'namespace ' + cpp_device_name + '{\n'
                '\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '}\n'
                '#endif\n'
                '')

    data = append_include(data, 'viennacl/device_specific/templates/' + cpp_class_name + '.hpp')
    # Construct the same type that the function declaration above names
    # (parameters_type), with an explicit comma-separated argument list rather
    # than the repr of a Python tuple/list.
    add_to_database_arguments = [vendor_prefix + '_id', utils.DEVICE_TYPE_CL_NAME[device.type], 'ocl::'+architecture_family,
                                 '"' + device.name + '"', cpp_class_name + '::parameters_type(' + ','.join(map(str, parameters)) + ')']
    core = ' db.' + function_name + '(' + ', '.join(add_to_database_arguments) + ');'

    already_declared = data.find(function_declaration)
    if already_declared==-1:
        # First profile for this overload: add the function at the top of the
        # innermost (device) namespace.
        substr = 'namespace ' + cpp_device_name + '{\n'
        insert_index = data.index(substr) + len(substr)
        data = data[:insert_index] + '\n' + function_declaration + '\n{\n' + core + '\n}\n' + data[insert_index:]
    else:
        # Overload already present: replace its body (from the opening brace,
        # including a preceding newline, up to the first closing brace).
        i1 = data.find('{', already_declared)
        if data[i1-1]=='\n':
            i1 = i1 - 1
        i2 = data.find('}', already_declared) + 1
        data = data[:i1] + '\n{\n' + core + '\n}' + data[i2:]

    #Write the header file
    with open(header_path, "w+") as myfile:
        myfile.write(data)

    #Updates the global ViennaCL headers
    with open(os.path.join(builtin_database_dir, operation + '.hpp'), 'r+') as operation_header:
        data = operation_header.read()
        data = append_include(data, os.path.relpath(header_path, os.path.join(viennacl_root, os.pardir)))

        scope_name = '_'.join(('init', operation) + additional_parameters)
        scope = data.index(scope_name)
        function_call = ' ' + '::'.join(header_hierarchy + [cpp_device_name, function_name]) + '(' + ', '.join(['result'] + [additional_parameters_dict[k] + '()' for k in additional_parameters]) + ')'
        if function_call not in data:
            # Insert the call on its own line just before 'return result'.
            insert_index = data.rindex('\n', 0, data.index('return result', scope))
            data = data[:insert_index] + function_call + ';\n' + data[insert_index:]

        # Rewrite the file in place with the updated contents.
        operation_header.seek(0)
        operation_header.truncate()
        operation_header.write(data)