From acb7fe73e8169e0f5316cfaac2fe9cc664c882ab Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 5 Oct 2014 05:16:21 +0200 Subject: [PATCH] Restored VCL header generation functionnality --- atidlas/execute.hpp | 2 + atidlas/templates/reduction.hpp | 2 +- atidlas/templates/row_wise_reduction.hpp | 2 +- atidlas/templates/template_base.hpp | 8 ++ autotune/config.ini | 33 +++--- autotune/python/autotune.py | 35 +++--- autotune/python/tools.py | 143 ++++++++++++++++++++++- autotune/python/utils.py | 33 ------ autotune/python/vclio.py | 116 ------------------ 9 files changed, 189 insertions(+), 185 deletions(-) delete mode 100644 autotune/python/utils.py delete mode 100644 autotune/python/vclio.py diff --git a/atidlas/execute.hpp b/atidlas/execute.hpp index cbc384566..e8547ae13 100644 --- a/atidlas/execute.hpp +++ b/atidlas/execute.hpp @@ -8,6 +8,7 @@ #include "viennacl/tools/tools.hpp" #include "viennacl/tools/timer.hpp" #include "viennacl/scheduler/forwards.h" +#include "viennacl/scheduler/io.hpp" #include "atidlas/forwards.h" #include "atidlas/templates/template_base.hpp" @@ -25,6 +26,7 @@ namespace atidlas inline void execute(template_base const & T, statements_container const & statements, viennacl::ocl::context & ctx = viennacl::ocl::current_context(), bool force_compilation = false) { + //std::cout << statements.data().front() << std::endl; //Generate program name std::string program_name = tools::statements_representation(statements, BIND_TO_HANDLE); execution_handler handler(program_name, ctx, ctx.current_device(), force_compilation); diff --git a/atidlas/templates/reduction.hpp b/atidlas/templates/reduction.hpp index 0648dc5af..84272149f 100644 --- a/atidlas/templates/reduction.hpp +++ b/atidlas/templates/reduction.hpp @@ -148,7 +148,7 @@ private: str[0] = "#namereg"; else for (unsigned int a = 0; a < simd_width; ++a) - str[a] = "#namereg.s" + tools::to_string(a); + str[a] = append_simd_suffix("#namereg.s", a); for (unsigned int k = 0; k < exprs.size(); ++k) { diff --git a/atidlas/templates/row_wise_reduction.hpp b/atidlas/templates/row_wise_reduction.hpp index c02b2919c..863386bb3 100644 --- a/atidlas/templates/row_wise_reduction.hpp +++ b/atidlas/templates/row_wise_reduction.hpp @@ -110,7 +110,7 @@ private: str[0] = "#namereg"; else for (unsigned int a = 0; a < simd_width; ++a) - str[a] = "#namereg.s" + to_string(a); + str[a] = append_simd_suffix("#namereg.s",a); for (unsigned int k = 0; k < exprs.size(); ++k) diff --git a/atidlas/templates/template_base.hpp b/atidlas/templates/template_base.hpp index c90f3e4f5..bfc8478c0 100644 --- a/atidlas/templates/template_base.hpp +++ b/atidlas/templates/template_base.hpp @@ -383,6 +383,14 @@ protected: protected: + static std::string append_simd_suffix(std::string const & str, unsigned int i) + { + assert(i < 16); + static char suffixes[] = {'0','1','2','3','4','5','6','7','8','9', + 'a','b','c','d','e','f'}; + return str + tools::to_string(suffixes[i]); + } + static bool is_offset_modifier(viennacl::scheduler::statement_node const & node) { return node.op.type==viennacl::scheduler::OPERATION_BINARY_VECTOR_DIAG_TYPE diff --git a/autotune/config.ini b/autotune/config.ini index e07a8a924..47d98d4ed 100644 --- a/autotune/config.ini +++ b/autotune/config.ini @@ -1,24 +1,23 @@ -#will save the archive into /tmp/name-of-operation.dat -tmp-folder = /tmp/ +viennacl-src-root = /home/philippe/Development/viennacl-dev/viennacl/ [vector-axpy] devices = 0 -precision = single -#~ size = 10000000 -#~ -#~ [matrix-axpy] -#~ devices = 0 -#~ precision = 
single -#~ size = 3072, 3072 -#~ -#~ [row-wise-reduction] -#~ devices = 0 -#~ precision = single -#~ layout = N, T -#~ size = 3968, 3968 +precision = single, double +size = 5000000 + +[matrix-axpy] +devices = 0 +precision = single, double +size = 2560, 2560 + +[row-wise-reduction] +devices = 0 +precision = single, double +layout = N,T +size = 2560, 2560 [matrix-product] devices = 0 precision = single -layout = NT -#size = 1536, 1536, 1536 +layout = NN,NT,TN,TT +size = 1536, 1536, 1536 diff --git a/autotune/python/autotune.py b/autotune/python/autotune.py index 7a06c1ca7..a9cefd64f 100644 --- a/autotune/python/autotune.py +++ b/autotune/python/autotune.py @@ -54,6 +54,8 @@ def do_tuning(config_fname, spec_fname, viennacl_root): confdevices = p['devices'] devices = utils.DEVICES_PRESETS[confdevices] if confdevices in utils.DEVICES_PRESETS else [utils.all_devices[int(i)] for i in confdevices] precisions = map_to_list(str, p['precision']) + if 'all' in precisions: + precisions = ['single','double'] datatypes = [DATATYPES[k] for k in precisions] #Iterate through the datatypes and the devices for datatype, device in itertools.product(datatypes, devices): @@ -64,20 +66,23 @@ def do_tuning(config_fname, spec_fname, viennacl_root): if datatype is vcl.float64 and not device.double_fp_config: sys.stderr.write('Warning : The device ' + device.name + ' does not support double precision! Skipping ...') continue - #Helper - def execute(device, statement, other_params, sizes, fname = os.devnull, parameters = None): + #Helper for execution + def execute(device, node, other_params, sizes, fname = os.devnull, parameters = None): if parameters: TemplateType = TYPES[operation]['template'] return tools.benchmark(TemplateType(TemplateType.Parameters(*parameters),*other_params), statement, device) print('-----') print(' '.join(map(str, ("Now tuning:", datatype.__name__, '-', operation, '-'.join(other_params), '[' + device.name, '(' + device.platform.name + ')] for sizes', sizes)))) with open(fname, "w+") as archive: - return optimize.genetic(statement, device, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params), + with vcl.Statement(node) as statement: + return optimize.genetic(statement, device, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params), lambda t: TYPES[operation]['perf-index']([datatype().itemsize, sizes, t]), TYPES[operation]['perf-measure'], archive) - #Helper - def tune(execution_handler, nTuning, nDataPoints, draw): + #Helper for tuning + def tune(execution_handler, nTuning, nDataPoints, draw, additional_parameters): if 'size' in p: profile = execution_handler(map_to_list(int, p['size'])) + if 'viennacl-src-root' in config: + tools.update_viennacl_headers(config['viennacl-src-root'],device,datatype,operation,additional_parameters,profile) else: def compute_perf(x, t): return TYPES[operation]['perf-index']([datatype().itemsize, x, t]) @@ -89,15 +94,17 @@ def do_tuning(config_fname, spec_fname, viennacl_root): def execution_handler(sizes, fname=os.devnull, parameters=None): x = vcl.Vector(sizes[0], context=ctx, dtype=datatype) y = vcl.Vector(sizes[0], context=ctx, dtype=datatype) - return execute(device, vcl.Statement(vcl.ElementProd(vcl.exp(x + y),vcl.cos(x + y))), (), sizes, fname, parameters) - tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=10, high=100000, size=1)) + z = vcl.Vector(sizes[0], context=ctx, dtype=datatype) + return execute(device, vcl.Assign(z, vcl.ElementProd(vcl.exp(x + y),vcl.cos(x + 
y))), (), sizes, fname, parameters) + tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=10, high=100000, size=1), ()) #Matrix AXPY if operation=='matrix-axpy': def execution_handler(sizes, fname=os.devnull, parameters=None): A = vcl.Matrix(sizes, context=ctx, dtype=datatype) B = vcl.Matrix(sizes, context=ctx, dtype=datatype) - return execute(device, vcl.Statement(A+B), (), sizes, fname, parameters) - tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=5, high=100, size=2)) + C = vcl.Matrix(sizes, context=ctx, dtype=datatype) + return execute(device, vcl.Assign(C,A+B), (), sizes, fname, parameters) + tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=5, high=100, size=2), ()) #Row-wise reduction if operation=='row-wise-reduction': layouts = map_to_list(str,p['layout']) @@ -107,9 +114,10 @@ def do_tuning(config_fname, spec_fname, viennacl_root): def execution_handler(sizes, fname=os.devnull, parameters=None): A = vcl.Matrix(sizes if A_trans=='N' else sizes[::-1], context=ctx, dtype=datatype, layout=vcl.COL_MAJOR) x = vcl.Vector(sizes[1] if A_trans=='N' else sizes[0], context=ctx, dtype=datatype) + y = vcl.Vector(sizes[0] if A_trans=='N' else sizes[1], context=ctx, dtype=datatype) LHS = A if A_trans=='N' else A.T - execute(device, vcl.Statement(LHS*x), (), sizes, fname, parameters) - tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=5, high=100, size=2)) + return execute(device, vcl.Assign(y, LHS*x), (), sizes, fname, parameters) + tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=5, high=100, size=2), (A_trans,)) #Matrix Product if operation=='matrix-product': layouts = map_to_list(str,p['layout']) @@ -126,9 +134,8 @@ def do_tuning(config_fname, spec_fname, viennacl_root): alpha = vcl.HostScalar(1.0, context=ctx, dtype = datatype) beta = vcl.HostScalar(1.0, context=ctx, dtype = datatype) C = vcl.Matrix((sizes[0], sizes[2]), context=ctx, dtype = datatype, layout=vcl.COL_MAJOR) - statement = vcl.Statement(vcl.Assign(C,LHS*RHS*alpha + C*beta)) - return execute(device, statement,(A_trans, B_trans), sizes, fname, parameters) - tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=1, high=40, size=3)) + return execute(device, vcl.Assign(C,LHS*RHS*alpha + C*beta),(A_trans, B_trans), sizes, fname, parameters) + tune(execution_handler, 50, 10000, lambda : 64*np.random.randint(low=1, high=40, size=3),(layout[0], layout[1])) diff --git a/autotune/python/tools.py b/autotune/python/tools.py index ecadd89af..afd39ce56 100644 --- a/autotune/python/tools.py +++ b/autotune/python/tools.py @@ -1,6 +1,11 @@ from __future__ import division import pyopencl import time +import os +import sys + +import pyopencl as cl +import pyviennacl as vcl from pyviennacl.atidlas import StatementsTuple class PhysicalLimitsNV: @@ -158,7 +163,6 @@ def benchmark(template, statement, device): if occupancy_record.occupancy < 15 : raise ValueError("Template has too low occupancy") else: - #~ try: template.execute(statement, True) statement.result.context.finish_all_queues() N = 0 @@ -170,5 +174,138 @@ def benchmark(template, statement, device): current_time += time.time() - time_before N+=1 return current_time/N - #~ except: - #~ raise ValueError("Invalid template") + + +def update_viennacl_headers(viennacl_root, device, datatype, operation, additional_parameters, parameters): + + def sanitize_string(string, keep_chars = ['_']): + string = string.replace(' ', '_').replace('-', '_').lower() + string = "".join(c for c in string if 
c.isalnum() or c in keep_chars).rstrip() + return string + + def append_include(data, path): + include_name = '#include "' + path +'"\n' + already_included = data.find(include_name) + if already_included == -1: + insert_index = data.index('\n', data.index('#define')) + 1 + return data[:insert_index] + '\n' + include_name + data[insert_index:] + return data + + + builtin_database_dir = os.path.join(viennacl_root, "device_specific", "builtin_database") + if not os.path.isdir(builtin_database_dir): + raise EnvironmentError('ViennaCL root path is incorrect. Cannot access ' + builtin_database_dir + '!\n' + 'Your version of ViennaCL may be too old and/or corrupted.') + + function_name_dict = { vcl.float32: 'add_4B', + vcl.float64: 'add_8B' } + + additional_parameters_dict = {'N': "char_to_type<'N'>", + 'T': "char_to_type<'T'>"} + + #Create the device-specific headers + cpp_device_name = sanitize_string(device.name) + function_name = function_name_dict[datatype] + operation = operation.replace('-','_') + + cpp_class_name = operation + '_template' + header_name = cpp_device_name + ".hpp" + function_declaration = 'inline void ' + function_name + '(' + ', '.join(['database_type<' + cpp_class_name + '::parameters_type> & db'] + \ + [additional_parameters_dict[x] for x in additional_parameters]) + ')' + + + device_type_prefix = { + cl.device_type.GPU: 'gpu', + cl.device_type.CPU: 'cpu', + cl.device_type.ACCELERATOR: 'accelerator' + }[device.type] + vendor_prefix = { + vcl.opencl.VendorId.beignet_id: 'beignet', + vcl.opencl.VendorId.nvidia_id: 'nvidia', + vcl.opencl.VendorId.amd_id: 'amd', + vcl.opencl.VendorId.intel_id: 'intel' + }[device.vendor_id] + architecture_family = vcl.opencl.architecture_family(device.vendor_id, device.name) + + header_hierarchy = ["devices", device_type_prefix, vendor_prefix, architecture_family] + header_directory = os.path.join(builtin_database_dir, *header_hierarchy) + header_path = os.path.join(header_directory, header_name) + + if not os.path.exists(header_directory): + os.makedirs(header_directory) + + if os.path.exists(header_path): + with open (header_path, "r") as myfile: + data=myfile.read() + else: + data = '' + + if not data: + ifndef_suffix = ('_'.join(header_hierarchy) + '_hpp_').upper() + data = ('#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_' + ifndef_suffix + '\n' + '#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_' + ifndef_suffix + '\n' + '\n' + '#include "viennacl/device_specific/forwards.h"\n' + '#include "viennacl/device_specific/builtin_database/common.hpp"\n' + '\n' + 'namespace viennacl{\n' + 'namespace device_specific{\n' + 'namespace builtin_database{\n' + 'namespace devices{\n' + 'namespace ' + device_type_prefix + '{\n' + 'namespace ' + vendor_prefix + '{\n' + 'namespace ' + architecture_family + '{\n' + 'namespace ' + cpp_device_name + '{\n' + '\n' + '}\n' + '}\n' + '}\n' + '}\n' + '}\n' + '}\n' + '}\n' + '}\n' + '#endif\n' + '') + + data = append_include(data, 'viennacl/device_specific/templates/' + cpp_class_name + '.hpp') + device_type = { + cl.device_type.GPU: 'CL_DEVICE_TYPE_GPU', + cl.device_type.CPU: 'CL_DEVICE_TYPE_CPU', + cl.device_type.ACCELERATOR: 'CL_DEVICE_TYPE_ACCELERATOR' + }[device.type] + add_to_database_arguments = [vendor_prefix + '_id', device_type, 'ocl::'+architecture_family, + '"' + device.name + '"', cpp_class_name + '::parameters_type(' + ','.join(map(str,parameters)) + ')'] + core = ' db.' 
+ function_name + '(' + ', '.join(add_to_database_arguments) + ');' + + already_declared = data.find(function_declaration) + if already_declared==-1: + substr = 'namespace ' + cpp_device_name + '{\n' + insert_index = data.index(substr) + len(substr) + data = data[:insert_index] + '\n' + function_declaration + '\n{\n' + core + '\n}\n' + data[insert_index:] + else: + i1 = data.find('{', already_declared) + if data[i1-1]=='\n': + i1 = i1 - 1 + i2 = data.find('}', already_declared) + 1 + data = data[:i1] + '\n{\n' + core + '\n}' + data[i2:] + + #Write the header file + with open(header_path, "w+") as myfile: + myfile.write(data) + + #Updates the global ViennaCL headers + with open(os.path.join(builtin_database_dir, operation + '.hpp'), 'r+') as operation_header: + data = operation_header.read() + data = append_include(data, os.path.relpath(header_path, os.path.join(viennacl_root, os.pardir))) + + scope_name = '_'.join(('init', operation) + additional_parameters) + scope = data.index(scope_name) + function_call = ' ' + '::'.join(header_hierarchy + [cpp_device_name, function_name]) + '(' + ', '.join(['result'] + [additional_parameters_dict[k] + '()' for k in additional_parameters]) + ')' + if function_call not in data: + insert_index = data.rindex('\n', 0, data.index('return result', scope)) + data = data[:insert_index] + function_call + ';\n' + data[insert_index:] + + operation_header.seek(0) + operation_header.truncate() + operation_header.write(data) diff --git a/autotune/python/utils.py b/autotune/python/utils.py deleted file mode 100644 index f8871eda9..000000000 --- a/autotune/python/utils.py +++ /dev/null @@ -1,33 +0,0 @@ -import pyopencl as cl -import pyviennacl as vcl - -all_devices = [d for platform in cl.get_platforms() for d in platform.get_devices()] - -DEVICE_TYPE_PREFIX = { cl.device_type.GPU: 'gpu', - cl.device_type.CPU: 'cpu', - cl.device_type.ACCELERATOR: 'accelerator' -} - -DEVICE_TYPE_CL_NAME = { cl.device_type.GPU: 'CL_DEVICE_TYPE_GPU', - cl.device_type.CPU: 'CL_DEVICE_TYPE_CPU', - cl.device_type.ACCELERATOR: 'CL_DEVICE_TYPE_ACCELERATOR' -} - -VENDOR_PREFIX = { vcl.opencl.VendorId.beignet_id: 'beignet', - vcl.opencl.VendorId.nvidia_id: 'nvidia', - vcl.opencl.VendorId.amd_id: 'amd', - vcl.opencl.VendorId.intel_id: 'intel' -} - -DEVICES_PRESETS = {'all': all_devices, - 'gpus': [d for d in all_devices if d.type==cl.device_type.GPU], - 'cpus': [d for d in all_devices if d.type==cl.device_type.CPU], - 'accelerators': [d for d in all_devices if d.type==cl.device_type.ACCELERATOR] -} - - - -def sanitize_string(string, keep_chars = ['_']): - string = string.replace(' ', '_').lower() - string = "".join(c for c in string if c.isalnum() or c in keep_chars).rstrip() - return string diff --git a/autotune/python/vclio.py b/autotune/python/vclio.py deleted file mode 100644 index 088582e57..000000000 --- a/autotune/python/vclio.py +++ /dev/null @@ -1,116 +0,0 @@ -import sys -import os -import utils - -def append_include(data, path): - include_name = '#include "' + path +'"\n' - already_included = data.find(include_name) - if already_included == -1: - insert_index = data.index('\n', data.index('#define')) + 1 - return data[:insert_index] + '\n' + include_name + data[insert_index:] - return data - -def generate_viennacl_headers(viennacl_root, device, datatype, operation, additional_parameters, parameters): - builtin_database_dir = os.path.join(viennacl_root, "device_specific", "builtin_database") - if not os.path.isdir(builtin_database_dir): - raise EnvironmentError('ViennaCL root path is 
incorrect. Cannot access ' + builtin_database_dir + '!\n' - 'Your version of ViennaCL may be too old and/or corrupted.') - - function_name_dict = { vcl.float32: 'add_4B', - vcl.float64: 'add_8B' } - - additional_parameters_dict = {'N': "char_to_type<'N'>", - 'T': "char_to_type<'T'>"} - - #Create the device-specific headers - cpp_device_name = utils.sanitize_string(device.name) - function_name = function_name_dict[datatype] - operation = operation.replace('-','_') - - cpp_class_name = operation + '_template' - header_name = cpp_device_name + ".hpp" - function_declaration = 'inline void ' + function_name + '(' + ', '.join(['database_type<' + cpp_class_name + '::parameters_type> & db'] + \ - [additional_parameters_dict[x] for x in additional_parameters]) + ')' - - device_type_prefix = utils.DEVICE_TYPE_PREFIX[device.type] - vendor_prefix = utils.VENDOR_PREFIX[device.vendor_id] - architecture_family = vcl.opencl.architecture_family(device.vendor_id, device.name) - - header_hierarchy = ["devices", device_type_prefix, vendor_prefix, architecture_family] - header_directory = os.path.join(builtin_database_dir, *header_hierarchy) - header_path = os.path.join(header_directory, header_name) - - if not os.path.exists(header_directory): - os.makedirs(header_directory) - - if os.path.exists(header_path): - with open (header_path, "r") as myfile: - data=myfile.read() - else: - data = '' - - if not data: - ifndef_suffix = ('_'.join(header_hierarchy) + '_hpp_').upper() - data = ('#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_' + ifndef_suffix + '\n' - '#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_' + ifndef_suffix + '\n' - '\n' - '#include "viennacl/device_specific/forwards.h"\n' - '#include "viennacl/device_specific/builtin_database/common.hpp"\n' - '\n' - 'namespace viennacl{\n' - 'namespace device_specific{\n' - 'namespace builtin_database{\n' - 'namespace devices{\n' - 'namespace ' + device_type_prefix + '{\n' - 'namespace ' + vendor_prefix + '{\n' - 'namespace ' + architecture_family + '{\n' - 'namespace ' + cpp_device_name + '{\n' - '\n' - '}\n' - '}\n' - '}\n' - '}\n' - '}\n' - '}\n' - '}\n' - '}\n' - '#endif\n' - '') - - data = append_include(data, 'viennacl/device_specific/templates/' + cpp_class_name + '.hpp') - - add_to_database_arguments = [vendor_prefix + '_id', utils.DEVICE_TYPE_CL_NAME[device.type], 'ocl::'+architecture_family, - '"' + device.name + '"', cpp_class_name + '::parameters' + str(parameters)] - core = ' db.' 
+ function_name + '(' + ', '.join(add_to_database_arguments) + ');' - - already_declared = data.find(function_declaration) - if already_declared==-1: - substr = 'namespace ' + cpp_device_name + '{\n' - insert_index = data.index(substr) + len(substr) - data = data[:insert_index] + '\n' + function_declaration + '\n{\n' + core + '\n}\n' + data[insert_index:] - else: - i1 = data.find('{', already_declared) - if data[i1-1]=='\n': - i1 = i1 - 1 - i2 = data.find('}', already_declared) + 1 - data = data[:i1] + '\n{\n' + core + '\n}' + data[i2:] - - #Write the header file - with open(header_path, "w+") as myfile: - myfile.write(data) - - #Updates the global ViennaCL headers - with open(os.path.join(builtin_database_dir, operation + '.hpp'), 'r+') as operation_header: - data = operation_header.read() - data = append_include(data, os.path.relpath(header_path, os.path.join(viennacl_root, os.pardir))) - - scope_name = '_'.join(('init', operation) + additional_parameters) - scope = data.index(scope_name) - function_call = ' ' + '::'.join(header_hierarchy + [cpp_device_name, function_name]) + '(' + ', '.join(['result'] + [additional_parameters_dict[k] + '()' for k in additional_parameters]) + ')' - if function_call not in data: - insert_index = data.rindex('\n', 0, data.index('return result', scope)) - data = data[:insert_index] + function_call + ';\n' + data[insert_index:] - - operation_header.seek(0) - operation_header.truncate() - operation_header.write(data)
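
For reference: vclio.generate_viennacl_headers() is gone; the header-generation entry point is now tools.update_viennacl_headers(viennacl_root, device, datatype, operation, additional_parameters, parameters), driven from do_tuning() via the new viennacl-src-root key in config.ini. Below is a minimal sketch of calling it directly, assuming it is run from autotune/python; the device index, ViennaCL path and profile values are placeholders, not real tuning output.

# Standalone sketch of invoking the header generator from tools.py outside the
# autotuner loop. Placeholder values only -- not a real tuning result.
import pyopencl as cl
import pyviennacl as vcl

import tools  # assumes the working directory is autotune/python

device = cl.get_platforms()[0].get_devices()[0]          # hypothetical: first available device
viennacl_src_root = '/path/to/viennacl-dev/viennacl/'    # same meaning as config.ini's viennacl-src-root

# A tuned profile as produced by the optimizer; the values here are made up.
profile = (1, 16, 16, 16, 16, 'FETCH_FROM_GLOBAL_STRIDED')

tools.update_viennacl_headers(viennacl_src_root,
                              device,
                              vcl.float32,     # selects the add_4B entry point
                              'matrix-axpy',   # mapped to matrix_axpy_template in the generated header
                              (),              # additional parameters, e.g. ('N', 'T') for matrix products
                              profile)

The call writes (or updates) a device-specific header under device_specific/builtin_database/devices/<type>/<vendor>/<architecture>/ and patches the matching <operation>.hpp dispatcher to include it and register the profile.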