Tuning: Merged tune branch.

- Much cleaner and more concise source
- Better exception handling
- Checks local minima to see if retuning is needed.

Resolved conflicts:
	bench/blas.cpp
	include/isaac/backend/templates/mproduct.h
	include/isaac/driver/buffer.h
	lib/array.cpp
	lib/backend/templates/mproduct.cpp
	lib/driver/buffer.cpp
	python/setup.py
	tune/pysrc/autotune.py
	tune/pysrc/dataset.py
	tune/pysrc/misc_tools.py
This commit is contained in:
Philippe Tillet
2015-06-28 17:53:16 -07:00
parent 48073dc710
commit e7cabf65ac
50 changed files with 832 additions and 3017 deletions

View File

@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 2.8.10)
# Add visibility of headers
file( GLOB_RECURSE MAKE_HEADERS_VISIBLE_SRC *.hpp *.h)
file( GLOB_RECURSE MAKE_HEADERS_VISIBLE_SRC *.cpp *.hpp *.h)
add_custom_target( MAKE_HEADERS_VISIBLE SOURCES ${MAKE_HEADERS_VISIBLE_SRC} )
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")

View File

@@ -27,9 +27,9 @@ if(MKL_FOUND)
else()
find_package(OpenBlas)
if(OPENBLAS_FOUND)
#set(BLAS_DEF ${BLAS_DEF} "-DBENCH_CBLAS")
#include_directories(${OPENBLAS_INCLUDE_DIR})
#set(BLAS_LIBS ${BLAS_LIBS} ${OPENBLAS_LIBRARIES} )
set(BLAS_DEF ${BLAS_DEF} "-DBENCH_CBLAS")
include_directories(${OPENBLAS_INCLUDE_DIR})
set(BLAS_LIBS ${BLAS_LIBS} ${OPENBLAS_LIBRARIES} )
endif()
endif()

View File

@@ -112,10 +112,17 @@ void bench(ad::numeric_type dtype, std::string operation)
#define BENCHMARK_HOST(OP, PERF) \
{\
ad::tools::timer tmr;\
double total_time = 0;\
std::vector<double> times;\
while(total_time < 1e-2){\
std::vector<int> cache_flusher(10000000, 0);\
tmr.start();\
OP;\
double t = 1e9*tmr.get();\
double time = tmr.get();\
times.push_back(time);\
total_time += time;\
}\
double t = 1e9*median(times);\
std::cout << " " << PERF << std::flush;\
}
@@ -127,6 +134,8 @@ void bench(ad::numeric_type dtype, std::string operation)
cudaEvent_t start, stop;\
cudaEventCreate(&start);\
cudaEventCreate(&stop);\
OP;\
cudaThreadSynchronize();\
while(total_time*1e-3 < 1e-3){\
flush = ad::zeros(1e6, 1, dtype);\
cudaEventRecord(start,0);\
@@ -290,15 +299,15 @@ void bench(ad::numeric_type dtype, std::string operation)
if(operation.substr(0,4)=="gemm")
{
std::vector<std::tuple<int_t, int_t, int_t> > MNKs;
// MNKs.push_back(std::make_tuple(896,896,896));
// MNKs.push_back(std::make_tuple(3072,3072,3072));
// MNKs.push_back(std::make_tuple(1024,64,768));
// MNKs.push_back(std::make_tuple(768,64,128));
// MNKs.push_back(std::make_tuple(64,64,32000));
// MNKs.push_back(std::make_tuple(1024,1024,32000));
MNKs.push_back(std::make_tuple(896,896,896));
MNKs.push_back(std::make_tuple(3072,3072,3072));
MNKs.push_back(std::make_tuple(1024,64,768));
MNKs.push_back(std::make_tuple(768,64,128));
MNKs.push_back(std::make_tuple(64,64,32000));
MNKs.push_back(std::make_tuple(1024,1024,32000));
for(unsigned int N = 1 ; N <10 ; ++N)
MNKs.push_back(std::make_tuple(128*N, 128*N, 128*N));
// for(unsigned int N = 1 ; N <10 ; ++N)
// MNKs.push_back(std::make_tuple(128*N, 128*N, 128*N));
/*---------*/
/*--BLAS3--*/
/*---------*/
@@ -308,6 +317,7 @@ void bench(ad::numeric_type dtype, std::string operation)
int_t N = std::get<1>(MNK);
int_t K = std::get<2>(MNK);
std::cout << M << "," << N << "," << K;
std::cout << std::flush;
/* ISAAC */
ad::array C(M, N, dtype), A(M, K, dtype), B(N, K, dtype);
#if HAS_A_BLAS

View File

@@ -49,13 +49,44 @@ def main():
return optlist
def find_library(name, cmake_glob_list):
compiler=new_compiler()
cvars = sysconfig.get_config_vars()
compiler = new_compiler()
dirs = []
for gpath in cmake_glob_list.split(';'):
path = glob(gpath)
if path:
dirs += [path[0]]
return compiler.find_library_file(dirs, name)
return compiler.find_library_file(cvars['LIBDIR'].split(';') + dirs, name)
def find_opencl():
    """Locate the OpenCL library for linking.

    Returns {'include': '', 'lib': <directory containing the library>} when
    the library is found, or None otherwise. On Android builds the search
    hints differ from the x86 ones (both are CMake-substituted globs).
    """
    config_vars = sysconfig.get_config_vars()
    on_android = '-mandroid' in config_vars['PY_CFLAGS']
    hints = '${ANDROID_CL_GLOB_HINTS}' if on_android else '${X86_CL_GLOB_HINTS}'
    lib = find_library('OpenCL', hints)
    if lib:
        return {'include': '', 'lib': dirname(lib)}
    return None
def find_in_path(name, path):
    """Return the absolute path of file `name` found on search path `path`.

    `path` is an os.pathsep-separated list of directories, searched in order;
    returns None when `name` is in none of them.
    Adapted from http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/
    """
    for directory in path.split(os.pathsep):
        candidate = os.path.join(directory, name)
        if os.path.exists(candidate):
            return os.path.abspath(candidate)
    return None
def find_cuda():
# Locate the CUDA toolkit. Preference order: the CUDAHOME environment
# variable, then an `nvcc` binary found on PATH. Returns a dict of the
# form {'include': <dir>, 'lib': <dir>} on success, or None when no
# toolkit is found.
# NOTE(review): indentation was lost in this diff view, so the exact
# nesting of the branches below is not verifiable from here — confirm
# against the original python/setup.py before relying on it.
if 'CUDAHOME' in os.environ:
home = os.environ['CUDAHOME']
nvcc = os.path.join(home, 'bin', 'nvcc')
else:
nvcc = find_in_path('nvcc', os.environ['PATH'])
if nvcc:
# Toolkit root is two levels above the nvcc binary (<home>/bin/nvcc).
home = dirname(os.path.dirname(nvcc))
# 'lib64' assumes a 64-bit Linux toolkit layout — TODO confirm for
# other platforms/installations.
return {'include': os.path.join(home, 'include'),
'lib': os.path.join(home, 'lib64')}
else:
return None
#Tweaks warning, because boost-numpy and boost-python won't compile cleanly without these changes
cvars = sysconfig.get_config_vars()
@@ -63,14 +94,27 @@ def main():
cvars["CFLAGS"] = cvars["BASECFLAGS"] + ' ' + cvars['OPT']
cvars["LDFLAGS"] = '-Wl,--no-as-needed ' + cvars["LDFLAGS"]
is_on_android = '-mandroid' in cvars['PY_CFLAGS']
opencl = find_library('OpenCL', '${ANDROID_CL_GLOB_HINTS}' if is_on_android else '${X86_CL_GLOB_HINTS}')
#OpenCL
opencl_config = find_opencl()
library_dirs = [dirname(library) for library in [opencl] if library is not None]
#CUDA
cuda_config = find_cuda()
#Includes
#Libraries
libraries = ['OpenCL']
if cuda_config: libraries += ['cuda', 'nvrtc']
#Backends:
backend_defines = ['-DISAAC_WITH_OPENCL']
if cuda_config: backend_defines += ['-DISAAC_WITH_CUDA']
#Library directories
library_dirs = [config['lib'] for config in [opencl_config, cuda_config] if config is not None]
#Include directories
include ='${INCLUDE_DIRECTORIES_STR}'.split() + ['external/boost/include', os.path.join(find_module("numpy")[1], "core", "include")]
#Sources
#Source files
src = '${LIBISAAC_SRC_STR}'.split() + [os.path.join('src', 'wrap', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'model.cpp', 'exceptions.cpp']]
boostsrc = 'external/boost/libs/'
for s in ['numpy','python','smart_ptr','system','thread']:
@@ -84,7 +128,7 @@ def main():
src += glob(boostsrc + "/thread/src/pthread/*.cpp")
src= [f for f in src if not f.endswith("once_atomic.cpp")]
#Setup
setup(
name='isaac',
version='1.0',
@@ -96,12 +140,12 @@ def main():
ext_package="isaac",
ext_modules=[Extension(
'_isaac',src,
extra_compile_args= ['-D__CL_ENABLE_EXCEPTIONS', '-std=c++11', '-Wno-unused-function', '-Wno-unused-local-typedefs', '-Wno-sign-compare'],
extra_compile_args= backend_defines + ['-std=c++11', '-Wno-unused-function', '-Wno-unused-local-typedefs', '-Wno-sign-compare'],
extra_link_args=['-Wl,-soname=_isaac.so'],
undef_macros=[],
include_dirs=include,
library_dirs=library_dirs,
libraries=['OpenCL']
libraries=libraries
)],
cmdclass={'build_py': build_py, 'build_ext': build_ext_subclass},
classifiers=[

View File

@@ -179,10 +179,8 @@ ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(pow)
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(dot)
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(outer)
namespace detail
{
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(assign)
}
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(assign)
#undef ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR

View File

@@ -15,6 +15,7 @@ namespace detail
bool is_node_leaf(op_element const & op);
bool is_scalar_reduction(array_expression::node const & node);
bool is_vector_reduction(array_expression::node const & node);
bool is_assignment(op_element const & op);
bool is_elementwise_operator(op_element const & op);
bool is_elementwise_function(op_element const & op);
bool is_cast(op_element const & op);

View File

@@ -175,7 +175,7 @@ public:
base(binding_policy_t binding_policy);
virtual unsigned int lmem_usage(expressions_tuple const &) const;
virtual unsigned int registers_usage(expressions_tuple const &) const;
virtual std::vector<int_t> input_sizes(expressions_tuple const & expressions) = 0;
virtual std::vector<int_t> input_sizes(expressions_tuple const & expressions) const = 0;
virtual ~base();
std::string generate(const char * suffix, expressions_tuple const & expressions, driver::Device const & device);
virtual int is_invalid(expressions_tuple const & expressions, driver::Device const & device) const = 0;

View File

@@ -25,7 +25,7 @@ private:
public:
maxpy(parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
maxpy(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, binding_policy_t bind = BIND_ALL_UNIQUE);
std::vector<int_t> input_sizes(expressions_tuple const & expressions);
std::vector<int_t> input_sizes(expressions_tuple const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program & program, const char * suffix, base & fallback, controller<expressions_tuple> const &);
};

View File

@@ -48,10 +48,10 @@ private:
void enqueue_block(driver::CommandQueue & queue, int_t M, int_t N, int_t K, array const & A, array const & B, array const & C,
value_scalar const &alpha, value_scalar const &beta, driver::Program & program, const char * suffix, execution_options_type const & options);
array create_slice(array & M, int_t s0_0, int_t s0_1, int_t s1_0, int_t s1_1, bool swap);
std::vector<int_t> infos(expressions_tuple const & expressions, isaac::symbolic::preset::gemm::args &arguments);
std::vector<int_t> infos(expressions_tuple const & expressions, isaac::symbolic::preset::gemm::args &arguments) const;
public:
mproduct(mproduct::parameters_type const & parameters, bool check_bound, char A_trans, char B_trans);
std::vector<int_t> input_sizes(expressions_tuple const & expressions);
std::vector<int_t> input_sizes(expressions_tuple const & expressions) const;
void cleanup(values_holder beta, controller<expressions_tuple> const & ctr, model & fallback,
lhs_rhs_element* eA, lhs_rhs_element* eB, lhs_rhs_element* eC, lhs_rhs_element* ebeta, array const & A, array const & B, array const & C);
void enqueue(driver::CommandQueue & queue, driver::Program & program, const char * suffix, base & fallback, controller<expressions_tuple> const &ctr);

View File

@@ -34,7 +34,7 @@ private:
unsigned int lmem_usage() const;
std::string generate_impl(const char * suffix, expressions_tuple const &, driver::Device const & device, std::vector<mapping_type> const &) const;
public:
virtual std::vector<int_t> input_sizes(expressions_tuple const & expressions);
virtual std::vector<int_t> input_sizes(expressions_tuple const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program & program, const char * suffix, base & fallback, controller<expressions_tuple> const &);
private:
reduction_type reduction_type_;

View File

@@ -27,7 +27,7 @@ private:
public:
reduction(reduction::parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
reduction(unsigned int simd, unsigned int ls, unsigned int ng, fetching_policy_type fetch, binding_policy_t bind = BIND_ALL_UNIQUE);
std::vector<int_t> input_sizes(expressions_tuple const & expressions);
std::vector<int_t> input_sizes(expressions_tuple const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program & program, const char * suffix, base & fallback, controller<expressions_tuple> const &);
private:
std::vector< driver::Buffer > tmp_;

View File

@@ -22,7 +22,7 @@ private:
public:
vaxpy(vaxpy::parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
vaxpy(unsigned int _simd_width, unsigned int _group_size, unsigned int _num_groups, fetching_policy_type _fetching_policy, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
std::vector<int_t> input_sizes(expressions_tuple const & expressions);
std::vector<int_t> input_sizes(expressions_tuple const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program & program, const char * suffix, base & fallback, controller<expressions_tuple> const &);
};

View File

@@ -1,6 +1,8 @@
#ifndef ISAAC_DRIVER_BUFFER_H
#define ISAAC_DRIVER_BUFFER_H
#include "isaac/types.h"
#include "isaac/driver/common.h"
#include "isaac/driver/context.h"
#include "isaac/driver/handle.h"

View File

@@ -27,6 +27,7 @@ enum device_type
DEVICE_TYPE_ACCELERATOR = CL_DEVICE_TYPE_ACCELERATOR
};
#ifdef ISAAC_WITH_CUDA
namespace nvrtc
@@ -34,7 +35,7 @@ namespace nvrtc
namespace exception
{
#define ISAAC_CREATE_NVRTC_EXCEPTION(name, msg) class name: public std::exception { const char * what() const throw(){ return "NVRTC: Error- " msg; } }
#define ISAAC_CREATE_NVRTC_EXCEPTION(name, msg) class name: public std::exception { public: const char * what() const throw(){ return "NVRTC: Error- " msg; } }
ISAAC_CREATE_NVRTC_EXCEPTION(out_of_memory ,"out of memory exception");
ISAAC_CREATE_NVRTC_EXCEPTION(program_creation_failure ,"program creation failure");
@@ -59,7 +60,7 @@ namespace cuda
class base: public std::exception{};
#define ISAAC_CREATE_CUDA_EXCEPTION(name, msg) class name: public base { const char * what() const throw(){ return "CUDA: Error- " msg; } }
#define ISAAC_CREATE_CUDA_EXCEPTION(name, msg) class name: public base { public:const char * what() const throw(){ return "CUDA: Error- " msg; } }
ISAAC_CREATE_CUDA_EXCEPTION(invalid_value ,"invalid value");
@@ -129,6 +130,72 @@ void check(CUresult);
#endif
namespace ocl
{
// Exception hierarchy mirroring the OpenCL runtime error codes.
// ocl::check(cl_int) maps each CL_* status to one of these types.
namespace exception
{
// Common base so callers can catch any OpenCL driver error in one clause.
class base: public std::exception{};
// Expands to an exception class whose what() returns the fixed string
// "OpenCL: Error- <msg>".
#define ISAAC_CREATE_CL_EXCEPTION(name, msg) class name: public base { public: const char * what() const throw(){ return "OpenCL: Error- " msg; } }
// One exception type per OpenCL error code.
ISAAC_CREATE_CL_EXCEPTION(device_not_found, "device not found");
ISAAC_CREATE_CL_EXCEPTION(device_not_available, "device not available");
ISAAC_CREATE_CL_EXCEPTION(compiler_not_available, "compiler not available");
ISAAC_CREATE_CL_EXCEPTION(mem_object_allocation_failure, "object allocation failure");
ISAAC_CREATE_CL_EXCEPTION(out_of_resources, "launch out of resources");
ISAAC_CREATE_CL_EXCEPTION(out_of_host_memory, "out of host memory");
ISAAC_CREATE_CL_EXCEPTION(profiling_info_not_available, "profiling info not available");
ISAAC_CREATE_CL_EXCEPTION(mem_copy_overlap, "mem copy overlap");
ISAAC_CREATE_CL_EXCEPTION(image_format_mismatch, "image format mismatch");
ISAAC_CREATE_CL_EXCEPTION(image_format_not_supported, "image format not supported");
ISAAC_CREATE_CL_EXCEPTION(build_program_failure, "build program failure");
ISAAC_CREATE_CL_EXCEPTION(map_failure, "map failure");
ISAAC_CREATE_CL_EXCEPTION(invalid_value, "invalid value");
ISAAC_CREATE_CL_EXCEPTION(invalid_device_type, "invalid device type");
ISAAC_CREATE_CL_EXCEPTION(invalid_platform, "invalid platform");
ISAAC_CREATE_CL_EXCEPTION(invalid_device, "invalid device");
ISAAC_CREATE_CL_EXCEPTION(invalid_context, "invalid context");
ISAAC_CREATE_CL_EXCEPTION(invalid_queue_properties, "invalid queue properties");
ISAAC_CREATE_CL_EXCEPTION(invalid_command_queue, "invalid command queue");
ISAAC_CREATE_CL_EXCEPTION(invalid_host_ptr, "invalid host pointer");
ISAAC_CREATE_CL_EXCEPTION(invalid_mem_object, "invalid mem object");
ISAAC_CREATE_CL_EXCEPTION(invalid_image_format_descriptor, "invalid image format descriptor");
ISAAC_CREATE_CL_EXCEPTION(invalid_image_size, "invalid image size");
ISAAC_CREATE_CL_EXCEPTION(invalid_sampler, "invalid sampler");
ISAAC_CREATE_CL_EXCEPTION(invalid_binary, "invalid binary");
ISAAC_CREATE_CL_EXCEPTION(invalid_build_options, "invalid build options");
ISAAC_CREATE_CL_EXCEPTION(invalid_program, "invalid program");
ISAAC_CREATE_CL_EXCEPTION(invalid_program_executable, "invalid program executable");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_name, "invalid kernel name");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_definition, "invalid kernel definition");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel, "invalid kernel");
ISAAC_CREATE_CL_EXCEPTION(invalid_arg_index, "invalid arg index");
ISAAC_CREATE_CL_EXCEPTION(invalid_arg_value, "invalid arg value");
ISAAC_CREATE_CL_EXCEPTION(invalid_arg_size, "invalid arg size");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_args, "invalid kernel args");
ISAAC_CREATE_CL_EXCEPTION(invalid_work_dimension, "invalid work dimension");
ISAAC_CREATE_CL_EXCEPTION(invalid_work_group_size, "invalid work group size");
ISAAC_CREATE_CL_EXCEPTION(invalid_work_item_size, "invalid work item size");
ISAAC_CREATE_CL_EXCEPTION(invalid_global_offset, "invalid global offset");
ISAAC_CREATE_CL_EXCEPTION(invalid_event_wait_list, "invalid event wait list");
ISAAC_CREATE_CL_EXCEPTION(invalid_event, "invalid event");
ISAAC_CREATE_CL_EXCEPTION(invalid_operation, "invalid operation");
ISAAC_CREATE_CL_EXCEPTION(invalid_gl_object, "invalid GL object");
ISAAC_CREATE_CL_EXCEPTION(invalid_buffer_size, "invalid buffer size");
ISAAC_CREATE_CL_EXCEPTION(invalid_mip_level, "invalid MIP level");
ISAAC_CREATE_CL_EXCEPTION(invalid_global_work_size, "invalid global work size");
// CL_INVALID_PROPERTY only exists from OpenCL 1.1 headers onward.
#ifdef CL_INVALID_PROPERTY
ISAAC_CREATE_CL_EXCEPTION(invalid_property, "invalid property");
#endif
}
void check(cl_int err);
}
}
}

View File

@@ -64,6 +64,7 @@ array::array(array & M, slice const & s0, slice const & s1) : dtype_(M.dtype_),
context_(M.data_.context()), data_(M.data_)
{ }
template<typename DT>
array::array(int_t shape0, int_t shape1, std::vector<DT> const & data, driver::Context context)
: dtype_(to_numeric_type<DT>::value),
@@ -471,8 +472,7 @@ DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_MAX_TYPE, maximum, x.dtype())
DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_MIN_TYPE, minimum, x.dtype())
DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_POW_TYPE, pow, x.dtype())
namespace detail
{ DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ASSIGN_TYPE, assign, x.dtype()) }
DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ASSIGN_TYPE, assign, x.dtype())
DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_GREATER_TYPE, operator >, INT_TYPE)

View File

@@ -21,11 +21,16 @@ namespace detail
|| node.op.type_family==OPERATOR_COLUMNS_REDUCTION_TYPE_FAMILY;
}
bool is_elementwise_operator(op_element const & op)
bool is_assignment(op_element const & op)
{
return op.type== OPERATOR_ASSIGN_TYPE
|| op.type== OPERATOR_INPLACE_ADD_TYPE
|| op.type== OPERATOR_INPLACE_SUB_TYPE
|| op.type== OPERATOR_INPLACE_SUB_TYPE;
}
bool is_elementwise_operator(op_element const & op)
{
return is_assignment(op)
|| op.type== OPERATOR_ADD_TYPE
|| op.type== OPERATOR_SUB_TYPE
|| op.type== OPERATOR_ELEMENT_PROD_TYPE

View File

@@ -97,7 +97,7 @@ maxpy::maxpy(unsigned int simd, unsigned int ls1, unsigned int ls2,
base_impl<maxpy, maxpy_parameters>(maxpy_parameters(simd, ls1, ls2, ng1, ng2, fetch), bind)
{}
std::vector<int_t> maxpy::input_sizes(expressions_tuple const & expressions)
std::vector<int_t> maxpy::input_sizes(expressions_tuple const & expressions) const
{
isaac::array_expression const & array_expression = *(expressions.data().front());
std::pair<int_t, int_t> size = matrix_size(lhs_most(array_expression.tree(), array_expression.root()));

View File

@@ -3,6 +3,7 @@
#include "isaac/backend/keywords.h"
#include "isaac/model/model.h"
#include "isaac/symbolic/preset.h"
#include "isaac/exception/operation_not_supported.h"
#include "isaac/tools/make_vector.hpp"
#include "isaac/tools/to_string.hpp"
#include "isaac/tools/miscellaneous.hpp"
@@ -42,10 +43,13 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
return N*size_of(numeric_t);
}
int mproduct::is_invalid_impl(driver::Device const &, expressions_tuple const &) const
int mproduct::is_invalid_impl(driver::Device const &, expressions_tuple const & expressions) const
{
if (p_.A_fetching_policy!=FETCH_FROM_LOCAL && p_.B_fetching_policy!=FETCH_FROM_LOCAL&& (p_.local_fetch_0!=0 || p_.local_fetch_1!=0))
return TEMPLATE_GLOBAL_MEMORY_REQUIRES_ZERO_LOCAL_FETCH;
std::vector<int_t> MNK = input_sizes(expressions);
int_t M = MNK[0]; int_t N = MNK[1];
if(p_.depth > 1 && M*N*p_.depth > 1e6)
throw operation_not_supported_exception("This would necessitate a temporary larger than 1MB");
if ((p_.mS % p_.simd_width) > 0 || (p_.nS % p_.simd_width) > 0)
return TEMPLATE_MS_NS_MUST_BE_SIMD_WIDTH_MULTIPLE;
@@ -642,7 +646,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
return array(M, s0, s1);
}
std::vector<int_t> mproduct::infos(expressions_tuple const & expressions, symbolic::preset::gemm::args& arguments)
std::vector<int_t> mproduct::infos(expressions_tuple const & expressions, symbolic::preset::gemm::args& arguments) const
{
isaac::array_expression & array_expression = (*expressions.data().front());
array_expression::container_type & array = array_expression.tree();
@@ -663,7 +667,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
else throw;
}
std::vector<int_t> mproduct::input_sizes(expressions_tuple const & expressions)
std::vector<int_t> mproduct::input_sizes(expressions_tuple const & expressions) const
{
symbolic::preset::gemm::args dummy;
return infos(expressions, dummy);

View File

@@ -26,7 +26,7 @@ int mreduction::is_invalid_impl(driver::Device const &, expressions_tuple const
unsigned int mreduction::lmem_usage() const
{
return p_.local_size_0*(p_.local_size_1+1);
return (p_.local_size_0+1)*p_.local_size_1;
}
std::string mreduction::generate_impl(const char * suffix, expressions_tuple const & expressions, driver::Device const & device, std::vector<mapping_type> const & mappings) const
@@ -83,7 +83,7 @@ std::string mreduction::generate_impl(const char * suffix, expressions_tuple con
{"array2", "#pointer += #start1 + #start2*#ld; "
"#ld *= #nldstride; "}}, expressions, mappings);
unsigned int local_size_0_ld = p_.local_size_0+1;
unsigned int local_size_0_ld = p_.local_size_0;
std::string local_size_0_ld_str = to_string(local_size_0_ld);
for (const auto & e : reductions)
@@ -321,7 +321,7 @@ mreduction::mreduction(mreduction::parameters_type const & parameters,
base_impl<mreduction, mreduction_parameters>(parameters, binding_policy),
reduction_type_(rtype){ }
std::vector<int_t> mreduction::input_sizes(expressions_tuple const & expressions)
std::vector<int_t> mreduction::input_sizes(expressions_tuple const & expressions) const
{
array_expression const & first_expression = *expressions.data().front();
std::vector<std::size_t> idx = filter_nodes(&is_reduction, first_expression, false);

View File

@@ -35,7 +35,7 @@ inline void reduction::reduce_1d_local_memory(kernel_generation_stream & stream,
std::string const & buf_str, std::string const & buf_value_str, driver::backend_type backend) const
{
stream << "#pragma unroll" << std::endl;
stream << "for(unsigned int stride = " << size/2 << "; stride >0; stride /=2)" << std::endl;
stream << "for(unsigned int stride = " << size/2 << "; stride > 0; stride /=2)" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
stream << LocalBarrier(backend) << ";" << std::endl;
@@ -269,7 +269,7 @@ reduction::reduction(unsigned int simd, unsigned int ls, unsigned int ng,
base_impl<reduction, reduction_parameters>(reduction_parameters(simd,ls,ng,fetch), bind)
{}
std::vector<int_t> reduction::input_sizes(expressions_tuple const & expressions)
std::vector<int_t> reduction::input_sizes(expressions_tuple const & expressions) const
{
std::vector<size_t> reductions_idx = filter_nodes(&is_reduction, *(expressions.data().front()), false);
int_t N = vector_size(lhs_most(expressions.data().front()->tree(), reductions_idx[0]));

View File

@@ -101,7 +101,7 @@ vaxpy::vaxpy(unsigned int simd, unsigned int ls, unsigned int ng,
{}
std::vector<int_t> vaxpy::input_sizes(expressions_tuple const & expressions)
std::vector<int_t> vaxpy::input_sizes(expressions_tuple const & expressions) const
{
int_t size = static_cast<array_expression const *>(expressions.data().front().get())->shape()[0];
return tools::make_vector<int_t>() << size;

View File

@@ -13,14 +13,22 @@ Buffer::Buffer(cl::Buffer const & buffer) : backend_(OPENCL), context_(buffer.ge
}
Buffer::Buffer(Context const & context, std::size_t size) : backend_(context.backend_), context_(context), h_(backend_)
{
switch(backend_)
{
#ifdef ISAAC_WITH_CUDA
case CUDA: cuda::check(cuMemAlloc(h_.cu.get(), size)); break;
case CUDA:
cuda::check(cuMemAlloc(h_.cu.get(), size));
break;
#endif
case OPENCL: *h_.cl = cl::Buffer(*context.h_.cl, CL_MEM_READ_WRITE, size); break;
default: throw;
case OPENCL:
cl_int err;
*h_.cl = cl::Buffer(*context.h_.cl, CL_MEM_READ_WRITE, size, NULL, &err);
ocl::check(err);
break;
default:
throw;
}
}

View File

@@ -103,6 +103,70 @@ void check(CUresult err)
#endif
namespace ocl
{
// Translate an OpenCL status code into the matching ocl::exception type.
// CL_SUCCESS is a no-op; every recognized error code throws its dedicated
// exception class declared in driver/common.h.
void check(cl_int err)
{
using namespace isaac::driver::ocl::exception;
switch(err)
{
case CL_SUCCESS: break;
case CL_DEVICE_NOT_FOUND: throw device_not_found();
case CL_DEVICE_NOT_AVAILABLE: throw device_not_available();
case CL_COMPILER_NOT_AVAILABLE: throw compiler_not_available();
case CL_MEM_OBJECT_ALLOCATION_FAILURE: throw mem_object_allocation_failure();
case CL_OUT_OF_RESOURCES: throw out_of_resources();
case CL_OUT_OF_HOST_MEMORY: throw out_of_host_memory();
case CL_PROFILING_INFO_NOT_AVAILABLE: throw profiling_info_not_available();
case CL_MEM_COPY_OVERLAP: throw mem_copy_overlap();
case CL_IMAGE_FORMAT_MISMATCH: throw image_format_mismatch();
case CL_IMAGE_FORMAT_NOT_SUPPORTED: throw image_format_not_supported();
case CL_BUILD_PROGRAM_FAILURE: throw build_program_failure();
case CL_MAP_FAILURE: throw map_failure();
case CL_INVALID_VALUE: throw invalid_value();
case CL_INVALID_DEVICE_TYPE: throw invalid_device_type();
case CL_INVALID_PLATFORM: throw invalid_platform();
case CL_INVALID_DEVICE: throw invalid_device();
case CL_INVALID_CONTEXT: throw invalid_context();
case CL_INVALID_QUEUE_PROPERTIES: throw invalid_queue_properties();
case CL_INVALID_COMMAND_QUEUE: throw invalid_command_queue();
case CL_INVALID_HOST_PTR: throw invalid_host_ptr();
case CL_INVALID_MEM_OBJECT: throw invalid_mem_object();
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: throw invalid_image_format_descriptor();
case CL_INVALID_IMAGE_SIZE: throw invalid_image_size();
case CL_INVALID_SAMPLER: throw invalid_sampler();
case CL_INVALID_BINARY: throw invalid_binary();
case CL_INVALID_BUILD_OPTIONS: throw invalid_build_options();
case CL_INVALID_PROGRAM: throw invalid_program();
case CL_INVALID_PROGRAM_EXECUTABLE: throw invalid_program_executable();
case CL_INVALID_KERNEL_NAME: throw invalid_kernel_name();
case CL_INVALID_KERNEL_DEFINITION: throw invalid_kernel_definition();
case CL_INVALID_KERNEL: throw invalid_kernel();
case CL_INVALID_ARG_INDEX: throw invalid_arg_index();
case CL_INVALID_ARG_VALUE: throw invalid_arg_value();
case CL_INVALID_ARG_SIZE: throw invalid_arg_size();
case CL_INVALID_KERNEL_ARGS: throw invalid_kernel_args();
case CL_INVALID_WORK_DIMENSION: throw invalid_work_dimension();
case CL_INVALID_WORK_GROUP_SIZE: throw invalid_work_group_size();
case CL_INVALID_WORK_ITEM_SIZE: throw invalid_work_item_size();
case CL_INVALID_GLOBAL_OFFSET: throw invalid_global_offset();
case CL_INVALID_EVENT_WAIT_LIST: throw invalid_event_wait_list();
case CL_INVALID_EVENT: throw invalid_event();
case CL_INVALID_OPERATION: throw invalid_operation();
case CL_INVALID_GL_OBJECT: throw invalid_gl_object();
case CL_INVALID_BUFFER_SIZE: throw invalid_buffer_size();
case CL_INVALID_MIP_LEVEL: throw invalid_mip_level();
case CL_INVALID_GLOBAL_WORK_SIZE: throw invalid_global_work_size();
// CL_INVALID_PROPERTY only exists from OpenCL 1.1 headers onward.
#ifdef CL_INVALID_PROPERTY
case CL_INVALID_PROPERTY: throw invalid_property();
#endif
// NOTE(review): a bare `throw;` outside of an active exception handler calls
// std::terminate — consider throwing a dedicated "unknown error" exception
// carrying the raw cl_int instead.
default: throw;
}
}
}
}
}

View File

@@ -1,4 +1,5 @@
#include "isaac/driver/command_queue.h"
#include "isaac/driver/common.h"
#include "isaac/driver/context.h"
#include "isaac/driver/device.h"
#include "isaac/driver/event.h"
@@ -22,9 +23,15 @@ CommandQueue::CommandQueue(Context const & context, Device const & device, cl_co
switch(backend_)
{
#ifdef ISAAC_WITH_CUDA
case CUDA: cuda::check(cuStreamCreate(h_.cu.get(), 0)); break;
case CUDA:
cuda::check(cuStreamCreate(h_.cu.get(), 0));
break;
#endif
case OPENCL: *h_.cl = cl::CommandQueue(*context.h_.cl, *device.h_.cl, properties); break;
case OPENCL:
cl_int err;
*h_.cl = cl::CommandQueue(*context.h_.cl, *device.h_.cl, properties, &err);
ocl::check(err);
break;
default: throw;
}
}
@@ -61,7 +68,7 @@ Event CommandQueue::enqueue(Kernel const & kernel, NDRange global, driver::NDRan
break;
#endif
case OPENCL:
h_.cl->enqueueNDRangeKernel(*kernel.h_.cl, cl::NullRange, (cl::NDRange)global, (cl::NDRange)local, NULL, event.h_.cl.get());
ocl::check(h_.cl->enqueueNDRangeKernel(*kernel.h_.cl, cl::NullRange, (cl::NDRange)global, (cl::NDRange)local, NULL, event.h_.cl.get()));
break;
default: throw;
}

View File

@@ -29,7 +29,9 @@ Context::Context(Device const & device) : backend_(device.backend_), device_(dev
break;
#endif
case OPENCL:
*h_.cl = cl::Context(std::vector<cl::Device>(1, *device_.h_.cl));
cl_int err;
*h_.cl = cl::Context(std::vector<cl::Device>(1, *device_.h_.cl), NULL, NULL, NULL, &err);
ocl::check(err);
break;
default:
throw;

View File

@@ -123,8 +123,8 @@ Program::Program(Context const & context, std::string const & source) : backend_
*h_.cl = cl::Program(*context_.h_.cl, source);
try{
h_.cl->build(devices);
}catch(cl::Error const & e){
ocl::check(h_.cl->build(devices));
}catch(ocl::exception::build_program_failure const & e){
for(std::vector< cl::Device >::const_iterator it = devices.begin(); it != devices.end(); ++it)
std::cout << "Device : " << it->getInfo<CL_DEVICE_NAME>()
<< "Build Status = " << h_.cl->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(*it) << std::endl

View File

@@ -59,7 +59,7 @@ extern "C"
clRetainMemObject(mx); \
is::array y(N, TYPE_ISAAC, cl::Buffer(my), offy, incy); \
clRetainMemObject(my); \
execute(is::detail::assign(y, x + alpha*y), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); \
execute(is::assign(y, x + alpha*y), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); \
return clblasSuccess; \
}
@@ -75,7 +75,7 @@ extern "C"
{\
is::array x(N, TYPE_ISAAC, cl::Buffer(mx), offx, incx);\
clRetainMemObject(mx);\
execute(is::detail::assign(x, alpha*x), x.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
execute(is::assign(x, alpha*x), x.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
return clblasSuccess;\
}
@@ -94,7 +94,7 @@ extern "C"
clRetainMemObject(mx);\
is::array y(N, TYPE_ISAAC, cl::Buffer(my), offy, incy);\
clRetainMemObject(my);\
execute(is::detail::assign(y, x), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
execute(is::assign(y, x), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
return clblasSuccess;\
}
@@ -116,7 +116,7 @@ extern "C"
clRetainMemObject(my); \
is::scalar s(TYPE_ISAAC, cl::Buffer(dotProduct), offDP); \
clRetainMemObject(dotProduct); \
execute(is::detail::assign(s, dot(x,y)), s.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); \
execute(is::assign(s, dot(x,y)), s.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); \
return clblasSuccess; \
}
@@ -134,7 +134,7 @@ extern "C"
clRetainMemObject(mx);\
is::scalar s(TYPE_ISAAC, cl::Buffer(asum), offAsum);\
clRetainMemObject(asum);\
execute(is::detail::assign(s, sum(abs(x))), s.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
execute(is::assign(s, sum(abs(x))), s.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
return clblasSuccess;\
}
@@ -170,9 +170,9 @@ extern "C"
\
is::driver::Context const & context = A.context();\
if(transA==clblasTrans)\
execute(is::detail::assign(y, alpha*dot(A.T(), x) + beta*y), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
execute(is::assign(y, alpha*dot(A.T(), x) + beta*y), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
else\
execute(is::detail::assign(y, alpha*dot(A, x) + beta*y), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
execute(is::assign(y, alpha*dot(A, x) + beta*y), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
return clblasSuccess;\
}
@@ -215,14 +215,14 @@ extern "C"
is::driver::Context const & context = C.context();\
/*Operation*/\
if((transA==clblasTrans) && (transB==clblasTrans)){\
execute(is::detail::assign(C, alpha*dot(A.T(), B.T()) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
execute(is::assign(C, alpha*dot(A.T(), B.T()) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
}\
else if((transA==clblasTrans) && (transB==clblasNoTrans))\
execute(is::detail::assign(C, alpha*dot(A.T(), B) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
execute(is::assign(C, alpha*dot(A.T(), B) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
else if((transA==clblasNoTrans) && (transB==clblasTrans))\
execute(is::detail::assign(C, alpha*dot(A, B.T()) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
execute(is::assign(C, alpha*dot(A, B.T()) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
else\
execute(is::detail::assign(C, alpha*dot(A, B) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
execute(is::assign(C, alpha*dot(A, B) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
return clblasSuccess;\
}

View File

@@ -49,13 +49,44 @@ def main():
return optlist
def find_library(name, cmake_glob_list):
compiler=new_compiler()
cvars = sysconfig.get_config_vars()
compiler = new_compiler()
dirs = []
for gpath in cmake_glob_list.split(';'):
path = glob(gpath)
if path:
dirs += [path[0]]
return compiler.find_library_file(dirs, name)
return compiler.find_library_file(cvars['LIBDIR'].split(';') + dirs, name)
def find_opencl():
cvars = sysconfig.get_config_vars()
is_on_android = '-mandroid' in cvars['PY_CFLAGS']
lib = find_library('OpenCL', '/opt/adreno-driver*/lib' if is_on_android else '/opt/AMDAPPSDK*/lib/x86_64')
return {'include': '', 'lib': dirname(lib)} if lib else None
def find_in_path(name, path):
"Find a file in a search path"
#adapted fom http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/
for dir in path.split(os.pathsep):
binpath = os.path.join(dir, name)
if os.path.exists(binpath):
return os.path.abspath(binpath)
return None
def find_cuda():
if 'CUDAHOME' in os.environ:
home = os.environ['CUDAHOME']
nvcc = os.path.join(home, 'bin', 'nvcc')
else:
nvcc = find_in_path('nvcc', os.environ['PATH'])
if nvcc:
home = dirname(os.path.dirname(nvcc))
return {'include': os.path.join(home, 'include'),
'lib': os.path.join(home, 'lib64')}
else:
return None
#Tweaks warning, because boost-numpy and boost-python won't compile cleanly without these changes
cvars = sysconfig.get_config_vars()
@@ -63,14 +94,27 @@ def main():
cvars["CFLAGS"] = cvars["BASECFLAGS"] + ' ' + cvars['OPT']
cvars["LDFLAGS"] = '-Wl,--no-as-needed ' + cvars["LDFLAGS"]
is_on_android = '-mandroid' in cvars['PY_CFLAGS']
opencl = find_library('OpenCL', '/opt/adreno-driver*/lib' if is_on_android else '/opt/AMDAPPSDK*/lib/x86_64')
#OpenCL
opencl_config = find_opencl()
library_dirs = [dirname(library) for library in [opencl] if library is not None]
#CUDA
cuda_config = find_cuda()
#Includes
#Libraries
libraries = ['OpenCL']
if cuda_config: libraries += ['cuda', 'nvrtc']
#Backends:
backend_defines = ['-DISAAC_WITH_OPENCL']
if cuda_config: backend_defines += ['-DISAAC_WITH_CUDA']
#Library directories
library_dirs = [config['lib'] for config in [opencl_config, cuda_config] if config is not None]
#Include directories
include =' src/include'.split() + ['external/boost/include', os.path.join(find_module("numpy")[1], "core", "include")]
#Sources
#Source files
src = 'src/lib/symbolic/preset.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/io.cpp src/lib/symbolic/expression.cpp src/lib/model/model.cpp src/lib/model/predictors/random_forest.cpp src/lib/backend/templates/mreduction.cpp src/lib/backend/templates/reduction.cpp src/lib/backend/templates/mproduct.cpp src/lib/backend/templates/maxpy.cpp src/lib/backend/templates/base.cpp src/lib/backend/templates/vaxpy.cpp src/lib/backend/mapped_object.cpp src/lib/backend/stream.cpp src/lib/backend/parse.cpp src/lib/backend/keywords.cpp src/lib/backend/binder.cpp src/lib/array.cpp src/lib/value_scalar.cpp src/lib/driver/backend.cpp src/lib/driver/device.cpp src/lib/driver/kernel.cpp src/lib/driver/buffer.cpp src/lib/driver/platform.cpp src/lib/driver/check.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/context.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/handle.cpp src/lib/exception/unknown_datatype.cpp src/lib/exception/operation_not_supported.cpp src/lib/wrap/clBLAS.cpp '.split() + [os.path.join('src', 'wrap', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'model.cpp', 'exceptions.cpp']]
boostsrc = 'external/boost/libs/'
for s in ['numpy','python','smart_ptr','system','thread']:
@@ -84,7 +128,7 @@ def main():
src += glob(boostsrc + "/thread/src/pthread/*.cpp")
src= [f for f in src if not f.endswith("once_atomic.cpp")]
#Setup
setup(
name='isaac',
version='1.0',
@@ -96,12 +140,12 @@ def main():
ext_package="isaac",
ext_modules=[Extension(
'_isaac',src,
extra_compile_args= ['-D__CL_ENABLE_EXCEPTIONS', '-std=c++11', '-Wno-unused-function', '-Wno-unused-local-typedefs', '-Wno-sign-compare'],
extra_compile_args= backend_defines + ['-std=c++11', '-Wno-unused-function', '-Wno-unused-local-typedefs', '-Wno-sign-compare'],
extra_link_args=['-Wl,-soname=_isaac.so'],
undef_macros=[],
include_dirs=include,
library_dirs=library_dirs,
libraries=['OpenCL']
libraries=libraries
)],
cmdclass={'build_py': build_py, 'build_ext': build_ext_subclass},
classifiers=[

View File

@@ -106,6 +106,11 @@ namespace detail
return ndarray_to_iscarray(np::from_object(obj, to_np_dtype(tools::extract_dtype(odtype))), context);
}
std::shared_ptr<isc::array> create_zeros_array(isc::int_t M, isc::int_t N, bp::object odtype, isc::driver::Context context)
{
return std::shared_ptr<isc::array>(new isc::array(isc::zeros(M, N, tools::extract_dtype(odtype), context)));
}
std::shared_ptr<isc::array> create_empty_array(bp::object sizes, bp::object odtype, isc::driver::Context context)
{
typedef std::shared_ptr<isc::array> result_type;
@@ -281,9 +286,13 @@ void export_core()
.def("__init__", bp::make_constructor(detail::construct_scalar, bp::default_call_policies(), (bp::arg(""), bp::arg("context")=isc::driver::queues.default_context())))
;
//Other numpy-like initializers
//Other numpy-like initializers
bp::def("empty", &detail::create_empty_array, (bp::arg("shape"), bp::arg("dtype") = bp::scope().attr("float32"), bp::arg("context")=isc::driver::queues.default_context()));
//Assign
bp::def("assign", static_cast<isc::array_expression (*)(isc::array const &, isc::array const &)>(&isc::assign));\
bp::def("assign", static_cast<isc::array_expression (*)(isc::array const &, isc::array_expression const &)>(&isc::assign));\
//Binary
#define MAP_FUNCTION(name) \
bp::def(#name, static_cast<isc::array_expression (*)(isc::array const &, isc::array const &)>(&isc::name));\
@@ -302,6 +311,8 @@ void export_core()
bp::def(#name, static_cast<isc::array_expression (*)(isc::array const &)>(&isc::name));\
bp::def(#name, static_cast<isc::array_expression (*)(isc::array_expression const &)>(&isc::name));
bp::def("zeros", &detail::create_zeros_array, (bp::arg("shape"), bp::arg("dtype") = bp::scope().attr("float32"), bp::arg("context")=isc::driver::queues.default_context()));
MAP_FUNCTION(abs)
MAP_FUNCTION(acos)
MAP_FUNCTION(asin)

View File

@@ -2,7 +2,9 @@
#include <boost/python/suite/indexing/vector_indexing_suite.hpp>
#include <boost/python/suite/indexing/map_indexing_suite.hpp>
#include "isaac/model/model.h"
#include "isaac/symbolic/execute.h"
#include "common.hpp"
#include "driver.h"
@@ -65,16 +67,26 @@ namespace detail
std::shared_ptr<isc::driver::Context> make_context(isc::driver::Device const & dev)
{ return std::shared_ptr<isc::driver::Context>(new isc::driver::Context(dev)); }
bp::tuple flush(isc::array_expression const & expression, unsigned int queue_id, bp::list dependencies, bool tune, int label, std::string const & program_name, bool force_recompile)
bp::object enqueue(isc::array_expression const & expression, unsigned int queue_id, bp::list dependencies, bool tune, int label, std::string const & program_name, bool force_recompile)
{
std::list<isc::driver::Event> events;
std::vector<isc::driver::Event> cdependencies = tools::to_vector<isc::driver::Event>(dependencies);
std::shared_ptr<isc::array> parray(new isc::array(isc::control(expression,
isc::execution_options_type(queue_id, &events, &cdependencies),
isc::dispatcher_options_type(tune, label),
isc::compilation_options_type(program_name, force_recompile))));
isc::execution_options_type execution_options(queue_id, &events, &cdependencies);
isc::dispatcher_options_type dispatcher_options(tune, label);
isc::compilation_options_type compilation_options(program_name, force_recompile);
isc::array_expression::container_type::value_type root = expression.tree()[expression.root()];
if(isc::detail::is_assignment(root.op))
{
isc::execute(isc::control(expression, execution_options, dispatcher_options, compilation_options), isaac::models(execution_options.queue(expression.context())));
return bp::make_tuple(bp::ptr(root.lhs.array), tools::to_list(events.begin(), events.end()));
}
else
{
std::shared_ptr<isc::array> parray(new isc::array(isc::control(expression, execution_options, dispatcher_options, compilation_options)));
return bp::make_tuple(parray, tools::to_list(events.begin(), events.end()));
}
}
}
struct state_type{ };
@@ -152,7 +164,7 @@ void export_driver()
bp::def("get_platforms", &detail::get_platforms);
bp::def("flush", &detail::flush, (bp::arg("expression"), bp::arg("queue_id") = 0, bp::arg("dependencies")=bp::list(), bp::arg("tune") = false, bp::arg("label")=-1, bp::arg("program_name")="", bp::arg("recompile") = false));
bp::def("enqueue", &detail::enqueue, (bp::arg("expression"), bp::arg("queue_id") = 0, bp::arg("dependencies")=bp::list(), bp::arg("tune") = false, bp::arg("label")=-1, bp::arg("program_name")="", bp::arg("recompile") = false));
bp::class_<state_type>("state_type")
.def_readwrite("queue_properties",&isc::driver::queues.queue_properties)

View File

@@ -2,6 +2,7 @@
#include <boost/python.hpp>
#include "isaac/exception/operation_not_supported.h"
#include "isaac/driver/common.h"
#include "common.hpp"
#include "exceptions.h"
@@ -83,4 +84,12 @@ void export_exceptions()
wrap::exception<isaac::operation_not_supported_exception>("OperationNotSupported", bp::init<std::string>())
.def("__str__", &isaac::operation_not_supported_exception::what)
;
wrap::exception<isaac::driver::ocl::exception::out_of_resources>("LaunchOutOfResources")
.def("__str__", &isaac::driver::ocl::exception::out_of_resources::what)
;
wrap::exception<isaac::driver::ocl::exception::mem_object_allocation_failure>("MemObjectAllocationFailure")
.def("__str__", &isaac::driver::ocl::exception::mem_object_allocation_failure::what)
;
}

View File

@@ -47,7 +47,7 @@ void export_model()
#undef __PROP
}
#define WRAP_BASE(name) bp::class_<isaac::base_impl<isaac::name, isaac::name::parameters_type>, bp::bases<isaac::base>, boost::noncopyable>(#name "_base_impl", bp::no_init);
#define WRAP_BASE(name) bp::class_<isaac::base_impl<isaac::name, isaac::name::parameters_type>, bp::bases<isaac::base>, boost::noncopyable>(#name, bp::no_init);
#define WRAP_TEMPLATE(name, basename, ...) bp::class_<isaac::name, bp::bases<isaac::base_impl<isaac::basename, isaac::basename::parameters_type> > >(#name, bp::init<__VA_ARGS__>())\
.add_property("local_size_0", &isc::name::local_size_0)\
.add_property("local_size_1", &isc::name::local_size_1);

View File

@@ -1,18 +0,0 @@
find_program(PYINSTALLER pyinstaller)
if(PYINSTALLER)
set(SPEC_IN "${CMAKE_CURRENT_SOURCE_DIR}/pyinstaller_build.spec")
set(SPEC "${CMAKE_CURRENT_BINARY_DIR}/pyinstaller_build.spec")
set(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/build/timestamp")
file(GLOB DEPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/pysrc/*.py")
LIST(APPEND DEPS "${CMAKE_CURRENT_SOURCE_DIR}/pyinstaller_build.spec")
configure_file(${SPEC_IN} ${SPEC})
add_custom_command(OUTPUT ${OUTPUT}
COMMAND ${PYINSTALLER} ${SPEC_IN} ${CMAKE_CURRENT_SOURCE_DIR}
COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT}
DEPENDS ${DEPS} python)
add_custom_target(autotune ALL DEPENDS ${OUTPUT})
endif()

View File

@@ -1,74 +0,0 @@
Metadata-Version: 1.1
Name: pyopencl
Version: 2014.1
Summary: Python wrapper for OpenCL
Home-page: http://mathema.tician.de/software/pyopencl
Author: Andreas Kloeckner
Author-email: inform@tiker.net
License: MIT
Description: PyOpenCL lets you access GPUs and other massively parallel compute
devices from Python. It tries to offer computing goodness in the
spirit of its sister project `PyCUDA <http://mathema.tician.de/software/pycuda>`_:
* Object cleanup tied to lifetime of objects. This idiom, often
called
`RAII <http://en.wikipedia.org/wiki/Resource_Acquisition_Is_Initialization>`_
in C++, makes it much easier to write correct, leak- and
crash-free code.
* Completeness. PyOpenCL puts the full power of OpenCL's API at
your disposal, if you wish. Every obscure `get_info()` query and
all CL calls are accessible.
* Automatic Error Checking. All CL errors are automatically
translated into Python exceptions.
* Speed. PyOpenCL's base layer is written in C++, so all the niceties
above are virtually free.
* Helpful and complete `Documentation <http://documen.tician.de/pyopencl>`_
as well as a `Wiki <http://wiki.tiker.net/PyOpenCL>`_.
* Liberal license. PyOpenCL is open-source under the
`MIT license <http://en.wikipedia.org/wiki/MIT_License>`_
and free for commercial, academic, and private use.
* Broad support. PyOpenCL was tested and works with Apple's, AMD's, and Nvidia's
CL implementations.
To use PyOpenCL, you just need `numpy <http://numpy.org>`_ and an OpenCL
implementation.
(See this `howto <http://wiki.tiker.net/OpenCLHowTo>`_ for how to get one.)
Places on the web related to PyOpenCL:
* `Python package index <http://pypi.python.org/pypi/pyopencl>`_ (download releases)
.. image:: https://badge.fury.io/py/pyopencl.png
:target: http://pypi.python.org/pypi/pyopencl
* `C. Gohlke's Windows binaries <http://www.lfd.uci.edu/~gohlke/pythonlibs/#pyopencl>`_ (download Windows binaries)
* `Github <http://github.com/pyopencl/pyopencl>`_ (get latest source code, file bugs)
* `Documentation <http://documen.tician.de/pyopencl>`_ (read how things work)
* `Wiki <http://wiki.tiker.net/PyOpenCL>`_ (read installation tips, get examples, read FAQ)
Platform: UNKNOWN
Classifier: Environment :: Console
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Other Audience
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Natural Language :: English
Classifier: Programming Language :: C++
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 2
Classifier: Programming Language :: Python :: 2.4
Classifier: Programming Language :: Python :: 2.5
Classifier: Programming Language :: Python :: 2.6
Classifier: Programming Language :: Python :: 2.7
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.2
Classifier: Programming Language :: Python :: 3.3
Classifier: Topic :: Scientific/Engineering
Classifier: Topic :: Scientific/Engineering :: Mathematics
Classifier: Topic :: Scientific/Engineering :: Physics

File diff suppressed because it is too large Load Diff

View File

@@ -1,55 +0,0 @@
../pyopencl/_mymako.py
../pyopencl/array.py
../pyopencl/algorithm.py
../pyopencl/version.py
../pyopencl/cache.py
../pyopencl/clrandom.py
../pyopencl/reduction.py
../pyopencl/ipython.py
../pyopencl/_cluda.py
../pyopencl/__init__.py
../pyopencl/scan.py
../pyopencl/capture_call.py
../pyopencl/tools.py
../pyopencl/clmath.py
../pyopencl/elementwise.py
../pyopencl/characterize/performance.py
../pyopencl/characterize/__init__.py
../pyopencl/compyte/dtypes.py
../pyopencl/compyte/array.py
../pyopencl/compyte/__init__.py
../pyopencl/cl/pyopencl-ranluxcl.cl
../pyopencl/cl/pyopencl-airy.cl
../pyopencl/cl/pyopencl-eval-tbl.cl
../pyopencl/cl/pyopencl-bessel-y.cl
../pyopencl/cl/pyopencl-bessel-j.cl
../pyopencl/cl/pyopencl-complex.h
../pyopencl/_mymako.pyc
../pyopencl/array.pyc
../pyopencl/algorithm.pyc
../pyopencl/version.pyc
../pyopencl/cache.pyc
../pyopencl/clrandom.pyc
../pyopencl/reduction.pyc
../pyopencl/ipython.pyc
../pyopencl/_cluda.pyc
../pyopencl/__init__.pyc
../pyopencl/scan.pyc
../pyopencl/capture_call.pyc
../pyopencl/tools.pyc
../pyopencl/clmath.pyc
../pyopencl/elementwise.pyc
../pyopencl/characterize/performance.pyc
../pyopencl/characterize/__init__.pyc
../pyopencl/compyte/dtypes.pyc
../pyopencl/compyte/array.pyc
../pyopencl/compyte/__init__.pyc
../pyopencl/_cl.so
../pyopencl/_pvt_struct.so
./
dependency_links.txt
SOURCES.txt
top_level.txt
requires.txt
not-zip-safe
PKG-INFO

View File

@@ -1 +0,0 @@

View File

@@ -1,3 +0,0 @@
pytools>=2014.2
pytest>=2
decorator>=3.2.0

View File

@@ -1,3 +0,0 @@
_cl
_pvt_struct
pyopencl

View File

@@ -1,6 +1,4 @@
from sklearn import tree
from sklearn import ensemble
from sklearn.grid_search import GridSearchCV
import numpy as np
def gmean(a, axis=0, dtype=None):
@@ -18,29 +16,33 @@ def gmean(a, axis=0, dtype=None):
def nrmse(y_ground, y):
N = y.size
rmsd = np.sqrt(np.sum((y_ground - y)**2)/N)
if len(y_ground) > 1:
return rmsd/(np.max(y_ground) - np.min(y_ground))
else:
return rmsd
def train(X, Y, profiles):
X = np.array(X)
Y = np.array(Y)
M = X.shape[0]
def train_model(X, Y, profiles, perf, metric):
p = np.random.permutation(X.shape[0])
X = X[p,:]
Y = Y[p,:]
Y = np.array([perf(xx, yy) for xx, yy in zip(X, Y)])
Y[np.isinf(Y)] = 0
#Train the model
cut = int(0.9*X.shape[0])
#Train the model
cut = int(0.9*M)
XTr, YTr = X[:cut,:], Y[:cut,:]
XCv, YCv = X[cut:,:], Y[cut:,:]
nrmses = {}
for N in range(1,20):
for depth in range(1,20):
for N in range(1,min(M+1,20)):
for depth in range(1,min(M+1,20)):
clf = ensemble.RandomForestRegressor(N, max_depth=depth).fit(XTr, YTr)
t = np.argmax(clf.predict(XCv), axis = 1)
y = np.array([YCv[i,t[i]] for i in range(t.size)])
ground = np.max(YCv[:,:], axis=1)
nrmses[clf] = nrmse(ground, y)
clf = min(nrmses, key=nrmses.get)
print 'The optimal classifer has NRMSE = %.2g (%d estimators and the max depth is %d'%(nrmses[clf], clf.n_estimators, clf.max_depth)
return clf
return clf, nrmses[clf]

197
tune/optimize.py Normal file
View File

@@ -0,0 +1,197 @@
import isaac as isc
import random
from copy import deepcopy
from sys import stdout
from itertools import product
from deap import algorithms
from deap import base
from deap import creator
from deap import tools as deap_tools
from numpy import cumsum
import tools
fetch_types = [isc.fetching_policy_type.FETCH_FROM_LOCAL,
isc.fetching_policy_type.FETCH_FROM_LOCAL,
isc.fetching_policy_type.FETCH_FROM_GLOBAL_CONTIGUOUS,
isc.fetching_policy_type.FETCH_FROM_GLOBAL_STRIDED]
def exhaustive(template, sizes, context):
tree, _ = tools.tree_of(template, sizes, context)
metric = tools.metric_of(template)
nbits = tools.genetic_infos_of(template)['nbits']
categorical = tools.genetic_infos_of(template)['categorical']
ranges = [range(2**x) for x in nbits]
ranges = list(product(*ranges))
timings = {}
best = None
for idx, r in enumerate(ranges):
parameters = tuple([fetch_types[x] if i in categorical else 2**x for i,x in enumerate(r)])
try:
time = tools.benchmark(template, parameters, tree)
if not best or time < best[1]:
best = parameters, time
except (isc.OperationNotSupported, isc.LaunchOutOfResources, isc.MemObjectAllocationFailure):
pass
if best:
stdout.write('%.2f %% | Best %.2f [ for %s ]\r'%(float(idx*100)/len(ranges),metric(sizes, best[1]), best[0]))
return best[0]
def genetic(template, sizes, context, naccept=200, niter = 1000, cxpb=0.4, mutpb=0.4, popsize = 10, initializer = None, prior = None):
tree, _ = tools.tree_of(template, sizes, context)
metric = tools.metric_of(template)
genetic_infos = tools.genetic_infos_of(template)
nbits = genetic_infos['nbits']
offsets = cumsum([0] + nbits)
def bin2gray(A):
g = [int(A[0])]
for i in range(1, len(A)):
g += [int(A[i-1] != A[i])]
return g
def gray2int(A):
b = [A[0]]
for i in range(1, len(A)):
b += [int(b[i-1] != A[i])]
return int(''.join(map(str,b)), 2)
def encode(genome):
encoded = [bin2gray(bin(x)[2:].zfill(nb)) for x, nb in zip(genome, nbits)]
return sum(encoded, [])
def decode(genome):
result = []
for off1,off2 in zip(offsets[:-1],offsets[1:]):
result += [gray2int(genome[off1:off2])]
result = [fetch_types[x] if i in genetic_infos['categorical'] else 2**x for i,x in enumerate(result)]
return result
def evaluate(genome):
idx = tuple(genome)
if idx not in cache:
cache[idx] = tools.benchmark(template, decode(genome), tree)
return cache[idx],
cache = {}
hof = deap_tools.HallOfFame(1)
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin)
toolbox = base.Toolbox()
toolbox.register("evaluate", evaluate)
toolbox.register("mate", deap_tools.cxTwoPoint)
toolbox.register("mutate", deap_tools.mutFlipBit)
toolbox.register("select", deap_tools.selNSGA2)
#Initialization
if initializer is None:
initializer = ([random.randint(0, 2**x) for x in nbits] for i in iter(int,1))
population = []
genome = encode(prior if prior else list(initializer.next()))
while len(population) < popsize:
individual = creator.Individual(genome)
try:
individual.fitness.values = toolbox.evaluate(genome)
population += [individual]
except (isc.OperationNotSupported, isc.LaunchOutOfResources, isc.MemObjectAllocationFailure ):
pass
genome = encode(list(initializer.next()))
hof.update(population)
x = []
y = []
it = 0
while len(cache) < naccept and it<niter:
pad = len(cache) - len(x)
x += [len(cache)]*pad
y += [metric(sizes, hof[0].fitness.values[0])]*pad
offspring = []
while len(offspring) < popsize:
try:
op_choice = random.random()
#Cross-over
if op_choice < cxpb:
ind1, ind2 = map(toolbox.clone, random.sample(population, 2))
ind1, ind2 = toolbox.mate(ind1, ind2)
ind = ind1
toolbox.evaluate(ind)
offspring += [ind]
#Mutation
elif op_choice < cxpb + mutpb:
ind = toolbox.clone(random.choice(population))
ind, = toolbox.mutate(ind, 1.0/offsets[-1])
toolbox.evaluate(ind)
offspring += [ind]
#Reproduction
else:
offspring += [random.choice(population)]
except (isc.OperationNotSupported, isc.LaunchOutOfResources, isc.MemObjectAllocationFailure):
pass
#Update fitnesses
fitnesses = toolbox.map(toolbox.evaluate, offspring)
for ind, fit in zip(offspring, fitnesses):
ind.fitness.values = fit
#Update population
population[:] = toolbox.select(population + offspring, popsize)
hof.update(population)
optimal = '(%s)'%','.join(map(str,decode(hof[0])))
stdout.write('Iter %d | %d evaluated | Best %.2f [ for %s ]\r'%(it, x[-1], y[-1], optimal))
stdout.flush()
it += 1
stdout.write('\n')
return tuple(decode(hof[0])), x, y
def is_local_optimum(parameters, template, sizes, context):
tree, _ = tools.tree_of(template, sizes, context)
genetic_infos = tools.genetic_infos_of(template)
if issubclass(template, isc.vaxpy):
sweep_over = [0,1,2]
elif issubclass(template, isc.reduction):
sweep_over = [0,1,2]
elif issubclass(template, isc.maxpy):
sweep_over = [0,1,2,3,4]
elif issubclass(template, isc.mreduction):
sweep_over = [0,1,2,3,4]
elif issubclass(template, isc.mproduct):
sweep_over = [1,2,3,4,5,7,10,11]
#Evaluate the provided parameters guess
try:
reference = tools.benchmark(template, parameters, tree)
except (isc.OperationNotSupported, isc.LaunchOutOfResources, isc.MemObjectAllocationFailure):
return False
#Latency bound -- ignore
if reference < 2e-5:
return True
timings = {}
domain = [[v for v in [x/2, x, x*2] if 1 <= v <= 2**2**genetic_infos['nbits'][i]] \
if i in sweep_over else [x] for i, x in enumerate(parameters)]
for x in product(*domain):
if x==parameters:
pass
try:
time = tools.benchmark(template, x, tree)
if time/reference < .97:
return False
except (isc.OperationNotSupported, isc.LaunchOutOfResources, isc.MemObjectAllocationFailure):
pass
return True

View File

@@ -1,32 +0,0 @@
#!/usr/bin/env
import os, sys
prefix = sys.argv[2]
sys.path.append('/home/philippe/Development/ATIDLAS/build/python/pyatidlas/build/lib.linux-x86_64-2.7/')
sys.path.append(os.path.join(prefix, 'pysrc'))
a = Analysis([os.path.join(prefix, 'pysrc','autotune.py')],
hiddenimports=['scipy.sparse.csgraph._validation',
'scipy.special._ufuncs_cxx',
'scipy.sparse.linalg.dsolve.umfpack',
'scipy.integrate.vode',
'scipy.integrate.lsoda',
'sklearn.utils.sparsetools._graph_validation',
'sklearn.utils.sparsetools._graph_tools',
'sklearn.utils.lgamma',
'sklearn.tree._utils'],
hookspath=None,
excludes=['scipy.io.matlab','matplotlib','PyQt4'],
runtime_hooks=None)
pyz = PYZ(a.pure)
exe = EXE(pyz,
a.scripts,
a.binaries,
a.zipfiles,
a.datas,
name='autotune',
debug=False,
strip=None,
upx=True,
console=True )

View File

@@ -1,235 +0,0 @@
from __future__ import division
import argparse, itertools, os, sys, json
import misc_tools, optimize, dataset
import isaac as isc
import numpy as np
from numpy import random
from model import train_model
TYPES = { 'vaxpy': {'template':isc.vaxpy,
'perf-index':lambda x: 3*x[0]*x[1][0]/x[2]*1e-9,
'perf-measure':'GB/s'},
'maxpy': {'template':isc.maxpy,
'perf-index':lambda x: 3*x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
'perf-measure':'GB/s'},
'dot': {'template':isc.reduction,
'perf-index':lambda x: 2*x[0]*x[1][0]/x[2]*1e-9,
'perf-measure':'GB/s'},
'gemv': {'template': {'N': isc.mreduction_rows, 'T': isc.mreduction_cols},
'perf-index':lambda x: x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
'perf-measure':'GB/s'},
'gemm': {'template': {('N','N'): isc.mproduct_nn, ('T','N'): isc.mproduct_tn,
('N','T'): isc.mproduct_nt, ('T','T'): isc.mproduct_tt},
'perf-index': lambda x: 2*x[1][0]*x[1][1]*x[1][2]/x[2]*1e-9,
'perf-measure': 'GFLOP/s'} }
def do_tuning(args):
device = args.device
context = isc.context(device)
context.queues.append(isc.command_queue(context, device))
if os.path.isfile(args.out):
json_out = json.load(open(args.out, 'r'))
else:
json_out = {}
json_out["version"] = "1.0"
def map_to_list(T, x):
return list(map(T, x if isinstance(x, list) else [x]))
if(args.method=='simple'):
default_tuning_sizes = {'vaxpy': args.blas1_size, 'dot': args.blas1_size,
'maxpy' : args.blas2_size, 'gemv' : args.blas2_size,
'gemm': args.blas3_size}
for operation in ['vaxpy', 'dot', 'maxpy', 'gemv', 'gemm']:
for datatype in [isc.float32, isc.float64]:
dtypestr = datatype.__name__
if operation not in args.operations and operation + '-' + dtypestr not in args.operations:
continue
#Check data-type
if datatype is isc.float64 and not device.double_fp_config:
sys.stderr.write('Warning : The device ' + device.name + ' does not support double precision! Skipping ...')
continue
#~ #Helper for execution
def execute(symbolic, sizes, Template, parameters = None, fname = os.devnull):
if parameters is not None:
return misc_tools.benchmark(Template(*parameters), symbolic)
with open(fname, "w+") as archive:
return optimize.genetic(symbolic, Template, lambda t: TYPES[operation]['perf-index']([datatype(0).size, sizes, t]),
TYPES[operation]['perf-measure'], archive)
def log_spaced_points(a,b,N,r=128):
t = np.ceil(np.exp(np.linspace(np.log(a), np.log(b), N))/r)*r
return t.reshape(t.size,1).astype(int)
#Helper for tuning
def tune(execution_handler, layouts, tuning_sizes, training_sizes):
print('-----')
print(' '.join(map(str, ("Now tuning:", dtypestr, '-', operation, '-'.join(layouts), '[' + device.name, '(' + device.platform.name + ')]'))))
#Update JSON
full_operation = operation + ''.join(layouts)
prefix = os.path.join('data',os.path.join(full_operation,dtypestr))
if not os.path.exists(prefix):
os.makedirs(prefix)
if full_operation not in json_out:
json_out[full_operation] = {}
json_out[full_operation][dtypestr] = {}
D = json_out[full_operation][dtypestr]
if args.method == 'simple':
print 'Size : ', ','.join(map(str, default_tuning_sizes[operation]))
profiles = [execution_handler(map(int,default_tuning_sizes[operation]))]
else:
def compute_perf(x, t):
return TYPES[operation]['perf-index']([datatype(0).size, x, t])
profiles = dataset.sample_profiles(execution_handler, tuning_sizes)
if args.build_model:
X, Y, profiles = dataset.sample_dataset(prefix, profiles, execution_handler, training_sizes)
#profiles = np.loadtxt(prefix+'/profiles.csv')
#X = np.loadtxt(prefix+'/X.csv',ndmin=2)
#Y = np.loadtxt(prefix+'/Y.csv',ndmin=2)
clf = train_model(X, Y, profiles, compute_perf, TYPES[operation]['perf-measure'])
D['predictor'] = [{'children_left': e.tree_.children_left.tolist(),
'children_right': e.tree_.children_right.tolist(),
'threshold': e.tree_.threshold.astype('float64').tolist(),
'feature': e.tree_.feature.astype('float64').tolist(),
'value': e.tree_.value[:,:,0].astype('float64').tolist()} for e in clf.estimators_]
D['profiles'] = [map(int, x) for x in profiles]
Template = TYPES[operation]['template']
#Vector AXPY
if operation=='vaxpy':
def execution_handler(sizes, fname=os.devnull, parameters=None):
x = isc.empty(sizes[0], datatype, context=context)
y = isc.empty(sizes[0], datatype, context=context)
return execute(x + y, sizes, Template, parameters, fname)
tune(execution_handler, (), log_spaced_points(1e4, 1e7, 20), log_spaced_points(1e4, 1e7, 1000))
#Dot
if operation=='dot':
def execution_handler(sizes, fname=os.devnull, parameters=None):
x = isc.empty(sizes[0], datatype, context=context)
y = isc.empty(sizes[0], datatype, context=context)
s = isc.scalar(datatype)
return execute(isc.dot(x, y), sizes, Template, parameters, fname)
tune(execution_handler, (), log_spaced_points(1e4, 1e7, 50), log_spaced_points(1e4, 1e7, 1000))
#Matrix AXPY
if operation=='maxpy':
def execution_handler(sizes, fname=os.devnull, parameters=None):
A = isc.empty(sizes, datatype, context=context)
C = isc.empty(sizes, datatype, context=context)
return execute(A + C, sizes, Template, parameters, fname)
tune(execution_handler, 64, 5000, 2, (),'log', 'log')
#Row-wise dot
if operation=='gemv':
for A_trans in args.gemv_layouts:
def execution_handler(sizes, fname=os.devnull, parameters=None):
A = isc.empty(sizes if A_trans=='N' else sizes[::-1], datatype, context=context)
x = isc.empty(sizes[1], datatype, context=context)
LHS = A if A_trans=='N' else A.T
return execute(isc.dot(LHS, x), sizes, Template[A_trans], parameters, fname)
tuning_sizes = itertools.chain( itertools.product([128, 512, 2048, 8192], [128, 512, 2048, 8192]),
itertools.product([128, 512, 2048, 8192], [16384, 32768, 65536]),
itertools.product([16384, 32768, 65536], [128, 512, 2048, 8192]))
training_sizes = itertools.chain( itertools.product([2**k for k in range(4, 13)], [2**k for k in range(4, 13)]),
itertools.product([2**k for k in range(4, 13)], [2**k for k in range(13, 17)]),
itertools.product([2**k for k in range(13, 17)], [2**k for k in range(4, 13)]))
tune(execution_handler, (A_trans,), tuning_sizes, training_sizes)
#Matrix Product
if operation=='gemm':
for L in args.gemm_layouts:
A_trans = L[0]
B_trans = L[1]
def execution_handler(sizes, fname=os.devnull, parameters=None):
A = isc.empty((sizes[0], sizes[2]) if A_trans=='N' else (sizes[2], sizes[0]), datatype, context=context)
B = isc.empty((sizes[2], sizes[1]) if B_trans=='N' else (sizes[1], sizes[2]), datatype, context=context)
LHS = A if A_trans=='N' else A.T
RHS = B if B_trans=='N' else B.T
return execute(isc.dot(LHS, RHS), sizes, Template[(A_trans, B_trans)], parameters, fname)
tuning_sizes = itertools.product([64, 256, 1024, 2560], [64, 256, 1024, 2560], [256, 2560, 32768, 65536])
training_sizes = itertools.product([2**k for k in range(6, 13)], [2**k for k in range(6, 13)], [2**k for k in range(6, 17)])
tune(execution_handler,(A_trans,B_trans), tuning_sizes, training_sizes)
json.dump(json_out, open(args.out,'w'))
class ArgumentsHandler:
    """Parses the command line for the auto-tuner and normalizes the options.

    After construction the parsed options are exposed as instance attributes
    (the argparse namespace is copied wholesale into ``self.__dict__``).
    """
    def __init__(self, devices):
        # devices: flat list of compute devices; --device is an index into it.
        #Command line arguments
        parser = argparse.ArgumentParser()
        subparsers = parser.add_subparsers(dest='action')
        print_devices_parser = subparsers.add_parser('list-devices', help='List the devices available')
        tune_parser = subparsers.add_parser('tune', help='Auto-tuning')
        tune_parser.add_argument("--device", default=0, type=int)
        tune_parser.add_argument("--operations", default = 'vaxpy,maxpy,dot,gemv,gemm-float32', type=str)
        tune_parser.add_argument("--gemm-layouts", default='NN,NT,TN,TT', type=str)
        tune_parser.add_argument("--gemv-layouts", default='N,T', type=str)
        tune_parser.add_argument("--out", default='', type=str)
        tune_parser.add_argument("--viennacl-src-path", default='', type=str)
        tune_subparsers = tune_parser.add_subparsers(dest='method')
        simple_parser = tune_subparsers.add_parser('simple', help = 'Tune each operation for unique sizes')
        simple_parser.add_argument("--blas1-size", default = 10e6, type=int)
        simple_parser.add_argument("--blas2-size", nargs=2, default=[2560,2560], type=int)
        simple_parser.add_argument("--blas3-size", nargs=3, default=[1536,1536,1536],type=int)
        full_parser = tune_subparsers.add_parser('full', help = 'Tune each operation for randomly chosen sizes')
        full_parser.add_argument("--build-model", default=True, type=bool)
        full_parser.add_argument("--sample-size", default=64, type=int)
        args = parser.parse_args()
        # Expose every parsed option directly as an attribute of this handler.
        self.__dict__ = args.__dict__.copy()
        if self.action == 'tune':
            #Retypes
            self.device = devices[int(self.device)]
            if not self.out:
                # Default output file is derived from the device name.
                self.out = misc_tools.sanitize_string(self.device.name) + '.json'
            self.operations = self.operations.split(',')
            self.gemm_layouts = self.gemm_layouts.split(',')
            self.gemv_layouts = self.gemv_layouts.split(',')
            if self.method == 'simple':
                self.blas1_size = [int(float(self.blas1_size))]
                # NOTE(review): map() returns a list under Python 2, which the
                # rest of the code appears to rely on — confirm before porting.
                self.blas2_size = map(int, self.blas2_size)
                self.blas3_size = map(int, self.blas3_size)
if __name__ == "__main__":
    # Enable command-queue profiling so kernel execution times can be read
    # back from events during benchmarking.
    isc.state.queue_properties = isc.CL_QUEUE_PROFILING_ENABLE
    platforms = isc.get_platforms()
    devices = [d for platform in platforms for d in platform.get_devices()]
    args = ArgumentsHandler(devices)
    print("----------------")
    print("Devices available:")
    print("----------------")
    for (i, d) in enumerate(devices):
        print 'Device', i, '|', isc.device_type_to_string(d.type), '|', d.name, 'on', d.platform.name
    print("----------------")
    if args.action=='tune':
        print("------")
        print("Auto-tuning")
        print("------")
        do_tuning(args)

View File

@@ -1,54 +0,0 @@
import os
import sys
import re
import random
import numpy as np
def sample_profiles(execution_handler, generator):
    """Collect the distinct profiles produced by ``execution_handler`` over sizes.

    Parameters
    ----------
    execution_handler : callable
        Takes one size tuple and returns a profile (any equality-comparable
        object, typically a list of kernel parameters).
    generator : iterable
        Yields size tuples, all of the same length.

    Returns
    -------
    list
        The unique profiles, reordered so that the profile obtained for the
        size with the largest Euclidean norm comes first.
    """
    # print() form works identically under Python 2 (single argument) and 3.
    print("Sampling profiles...")
    t = np.empty(0)
    profiles = []
    for i, x in enumerate(generator):
        print(x)
        if i == 0:
            # Lazily size the feature matrix from the first size tuple.
            X = np.empty((0, len(x)))
        y = execution_handler(x)
        if y not in profiles:
            profiles.append(y)
        idx = profiles.index(y)
        X = np.vstack((X, x))
        t = np.append(t, idx)
    # Move the profile selected for the largest sampled size to the front.
    idx = int(t[np.argmax(np.linalg.norm(X, axis=1))])
    profiles = [profiles[idx]] + [x for i, x in enumerate(profiles) if i != idx]
    return profiles
def sample_dataset(prefix_name, profiles, execution_handler, generator):
    """Benchmark every profile on every generated size and save the dataset.

    Parameters
    ----------
    prefix_name : str
        Directory into which X.csv, Y.csv and profiles.csv are written
        (created if missing).
    profiles : list
        Profiles (parameter lists) to benchmark.
    execution_handler : callable
        ``(sizes, fname, profile) -> measurement``; a raising run is recorded
        as +inf (i.e. "infinitely slow").
    generator : iterable
        Yields size tuples, all of the same length.

    Returns
    -------
    (X, Y, profiles)
        Sizes matrix, measurement matrix (one column per profile), and the
        profiles reordered to match the column order of Y.
    """
    P = len(profiles)
    print("Generating the dataset...")
    Y = np.empty((0, P))
    for i, x in enumerate(generator):
        if i == 0:
            X = np.empty((0, len(x)))
        new_y = np.zeros(P)
        for j, y in enumerate(profiles):
            try:
                new_y[j] = execution_handler(x, os.devnull, y)
            # A profile may simply be invalid for this size: record it as
            # infinitely slow, but do not swallow KeyboardInterrupt/SystemExit
            # (the original bare `except:` did).
            except Exception:
                new_y[j] = float('inf')
        X = np.vstack((X, x))
        Y = np.vstack((Y, new_y))
        if i % 10 == 0:
            sys.stdout.write('%d data points generated\r' % i)
            sys.stdout.flush()
    # Order columns by the measurement obtained on the largest sampled size.
    idx = np.argsort(Y[np.argmax(np.linalg.norm(X, axis=1)), :])
    Y = Y[:, idx]
    profiles = [profiles[i] for i in idx]
    if not os.path.exists(prefix_name):
        os.makedirs(prefix_name)
    np.savetxt(os.path.join(prefix_name, "X.csv"), X)
    np.savetxt(os.path.join(prefix_name, "Y.csv"), Y)
    np.savetxt(os.path.join(prefix_name, "profiles.csv"), profiles)
    return X, Y, profiles

View File

@@ -1,205 +0,0 @@
import random, time, sys, copy
import misc_tools
import numpy as np
import isaac as isc
from deap import algorithms
from deap import base
from deap import creator
from deap import tools as deap_tools
from collections import OrderedDict as odict
def closest_divisor(N, x):
    """Return the divisor of N closest to x (ties resolved upward)."""
    start = max(1, min(round(x), N))
    below = start
    # Walk down until a divisor is found (terminates at 1, which divides N).
    while N % below > 0 and below > 0:
        below = below - 1
    above = start
    # Walk up until a divisor is found (terminates at N itself).
    while N % above > 0 and above < N:
        above = above + 1
    if x - below < above - x:
        return below
    return above
def b_gray_to_bin(A='00000000', endian='big'):
assert type(endian) is str
assert endian == 'little' or endian == 'big'
if endian == 'little': A = A[::-1] # Make sure endianness is big before conversion
b = A[0]
for i in range(1, len(A)): b += str( int(b[i-1] != A[i]) )
if endian == 'little': b = b[::-1] # Convert back to little endian if necessary
return b
class GeneticOperators(object):
    """Genetic-algorithm search over the binary-encoded parameter space of a template.

    Individuals are lists mixing gray-coded bit characters ('0'/'1') and small
    integers (indices into the fetching-policy enum); decode() turns an
    individual into the concrete parameter list expected by the template.
    """
    class Pow2(object):
        # Small wrapper marking a genome value whose decoded form is 2**value.
        def __init__(self, v):
            self.value = v
        @property
        def decoded(self):
            # Fixed: the getter was declared without `self`, so any access to
            # `.decoded` raised a TypeError and the referenced `self.value`
            # was unresolvable.
            return 2**self.value
    def __init__(self, symbolic, Template, out):
        self.device = symbolic.context.queues[0].device
        self.symbolic = symbolic
        self.Template = Template
        self.cache = {}
        self.out = out
        # Genome layout per template: an int is the number of gray-coded bits
        # of that field; isc.fetching_policy_type marks a categorical field.
        self.genome_info = {
            isc.vaxpy: [2,4,4,isc.fetching_policy_type],
            isc.reduction: [2,4,4,isc.fetching_policy_type],
            isc.maxpy: [2,3,3,3,3,isc.fetching_policy_type],
            isc.mreduction_rows: [2,3,3,3,3,isc.fetching_policy_type],
            isc.mreduction_cols: [2,3,3,3,3,isc.fetching_policy_type],
            isc.mproduct_nn: [2,3,3,3,3,3,3,3,isc.fetching_policy_type,isc.fetching_policy_type,3],
            isc.mproduct_nt: [2,3,3,3,3,3,3,3,isc.fetching_policy_type,isc.fetching_policy_type,3],
            isc.mproduct_tn: [2,3,3,3,3,3,3,3,isc.fetching_policy_type,isc.fetching_policy_type,3],
            isc.mproduct_tt: [2,3,3,3,3,3,3,3,isc.fetching_policy_type,isc.fetching_policy_type,3]
        }[Template]
        # Per-gene mutation probability: one expected mutation per genome.
        self.indpb = 1.0/sum([1 if x==isc.fetching_policy_type else x for x in self.genome_info])
        creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
        creator.create("Individual", list, fitness=creator.FitnessMin)
        self.toolbox = base.Toolbox()
        self.toolbox.register("population", self.init)
        self.toolbox.register("evaluate", self.evaluate)
        self.toolbox.register("mate", deap_tools.cxTwoPoint)
        self.toolbox.register("mutate", self.mutate)
        self.toolbox.register("select", deap_tools.selNSGA2)
    def decode(self, genome):
        """Decode an individual into the concrete template parameter list."""
        fetching_policy_type = isc.fetching_policy_type
        fetch = [fetching_policy_type.FETCH_FROM_LOCAL, fetching_policy_type.FETCH_FROM_GLOBAL_STRIDED, fetching_policy_type.FETCH_FROM_GLOBAL_CONTIGUOUS]
        is_gemm = self.Template in [isc.mproduct_nn, isc.mproduct_nt, isc.mproduct_tn, isc.mproduct_tt]
        result = []
        offset = 0
        for i, x in enumerate(self.genome_info):
            if x==isc.fetching_policy_type:
                result.append(fetch[genome[offset]])
                offset = offset + 1
            else:
                decoded = int(b_gray_to_bin(''.join(genome[offset:offset+x])), 2)
                # NOTE(review): for the GEMM layouts genome_info has 11 entries
                # (i in 0..10), so `i in [11, 12]` is never true and this always
                # takes 2**decoded — confirm the intended field indices.
                result.append(decoded if is_gemm and i in [11, 12] else 2**decoded)
                offset = offset + x
        #GEMM peculiarities
        if is_gemm:
            if fetching_policy_type.FETCH_FROM_LOCAL in result:
                lf1 = result[1]*result[3]/result[10]
            else:
                result[10] = 0
                lf1 = 0
            result.append(lf1)
        return result
    def init(self, N):
        """Build an initial population of N individuals that pass the skip filter."""
        result = []
        # GEMM only explores the local-memory fetch policy; other templates
        # explore the two global-memory policies.
        allowed_idx = [0] if self.Template in [isc.mproduct_nn, isc.mproduct_nt, isc.mproduct_tn, isc.mproduct_tt] else [1,2]
        for idx in allowed_idx:
            current = []
            while len(current) < N/len(allowed_idx):
                while True:
                    bincode = []
                    for i, x in enumerate(self.genome_info):
                        if x==isc.fetching_policy_type:
                            bincode = bincode + [idx]
                        else:
                            bincode = bincode + [str(random.randint(0,1)) for i in range(x)]
                    parameters = self.decode(bincode)
                    template = self.Template(*parameters)
                    array_expressions = isc.array_expression_container(self.symbolic)
                    registers_usage = template.registers_usage(array_expressions)/4
                    lmem_usage = template.lmem_usage(array_expressions)
                    local_size = parameters[1]*parameters[3]
                    occupancy_record = misc_tools.OccupancyRecord(self.device, local_size, lmem_usage, registers_usage)
                    if not misc_tools.skip(template, self.symbolic):
                        current.append(creator.Individual(bincode))
                        break
            result = result + current
        return result
    def mutate(self, individual):
        """Return a mutated copy of `individual` that passes the skip filter."""
        while True:
            new_individual = copy.deepcopy(individual)
            for i in range(len(new_individual)):
                # Integer genes are categorical (fetch policy); string genes
                # are gray-coded bits flipped with probability indpb.
                if isinstance(individual[i], int) and random.random() < 0.1:
                    while new_individual[i] == individual[i]:
                        new_individual[i] = random.randint(0, 2)
                elif not isinstance(individual[i], int) and random.random() < self.indpb:
                    new_individual[i] = '1' if new_individual[i]=='0' else '0'
            parameters = self.decode(new_individual)
            template = self.Template(*parameters)
            if not misc_tools.skip(template, self.symbolic):
                break
        return new_individual,
    def evaluate(self, individual):
        """Benchmark an individual, memoizing by its gene tuple, and log the result."""
        if tuple(individual) not in self.cache:
            parameters = self.decode(individual)
            template = self.Template(*parameters)
            tt = misc_tools.benchmark(template, self.symbolic)
            self.out.write(','.join([str(tt)]+map(str,map(int,parameters)))+'\n')
            self.cache[tuple(individual)] = tt
        return self.cache[tuple(individual)],
    def optimize(self, maxtime, maxgen, compute_perf, perf_metric):
        """Run a (mu+lambda)-style GA until `maxtime` ('MmSs' string) or `maxgen` generations."""
        hof = deap_tools.HallOfFame(1)
        # Begin the generational process
        gen = 0
        maxtime = time.strptime(maxtime, '%Mm%Ss')
        maxtime = maxtime.tm_min*60 + maxtime.tm_sec
        start_time = time.time()
        mu = 30
        cxpb = 0.2
        mutpb = 0.7
        population = self.init(mu)
        invalid_ind = [ind for ind in population if not ind.fitness.valid]
        fitnesses = self.toolbox.map(self.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
        hof.update(population)
        while time.time() - start_time < maxtime and gen < maxgen:
            # Vary the population
            offspring = []
            for _ in xrange(mu):
                op_choice = random.random()
                if op_choice < cxpb: # Apply crossover
                    while True:
                        ind1, ind2 = map(self.toolbox.clone, random.sample(population, 2))
                        ind1, ind2 = self.toolbox.mate(ind1, ind2)
                        del ind1.fitness.values
                        parameters = self.decode(ind1)
                        template = self.Template(*parameters)
                        if not misc_tools.skip(template, self.symbolic):
                            break
                    offspring.append(ind1)
                elif op_choice < cxpb + mutpb: # Apply mutation
                    ind = self.toolbox.clone(random.choice(population))
                    ind, = self.toolbox.mutate(ind)
                    del ind.fitness.values
                    offspring.append(ind)
                else: # Apply reproduction
                    offspring.append(random.choice(population))
            # Evaluate the individuals with an invalid fitness
            invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
            fitnesses = self.toolbox.map(self.evaluate, invalid_ind)
            for ind, fit in zip(invalid_ind, fitnesses):
                ind.fitness.values = fit
            # Update the hall of fame with the generated individuals
            hof.update(offspring)
            # Select the next generation population
            population[:] = self.toolbox.select(population + offspring, mu)
            #Update
            gen = gen + 1
            best_profile = '(%s)'%','.join(map(str,self.decode(hof[0])))
            best_performance = compute_perf(hof[0].fitness.values[0])
            sys.stdout.write('Generation %d | Time %d | Best %d %s [ for %s ]\r'%(gen, time.time() - start_time, best_performance, perf_metric, best_profile))
            sys.stdout.flush()
        sys.stdout.write('\n')
        return self.decode(hof[0])

View File

@@ -1,246 +0,0 @@
from __future__ import division
import time
import os
import sys
import isaac as isc
import numpy as np
class PhysicalLimitsNV:
    """Per-compute-capability hardware limits for NVIDIA GPUs.

    The figures mirror NVIDIA's occupancy-calculator tables and are selected
    from dev.nv_compute_capability, expected to be a (major, minor) pair.
    """
    def __init__(self, dev):
        self.compute_capability = dev.nv_compute_capability
        # Tesla (sm_1x)
        if self.compute_capability[0]==1:
            if self.compute_capability[1]<=1:
                self.warps_per_mp = 24
                self.threads_per_mp = 768
                self.num_32b_reg_per_mp = 8192
                self.reg_alloc_unit_size = 256
            else:
                self.warps_per_mp = 32
                self.threads_per_mp = 1024
                self.num_32b_reg_per_mp = 16384
                self.reg_alloc_unit_size = 512
            self.threads_per_warp = 32
            self.thread_blocks_per_mp = 8
            self.reg_alloc_granularity = 'block'
            self.reg_per_thread = 124
            self.shared_mem_per_mp = 16384
            self.shared_mem_alloc_unit_size = 512
            self.warp_alloc_granularity = 2
            self.max_thread_block_size = 512
        # Fermi (sm_2x)
        elif self.compute_capability[0]==2:
            self.threads_per_warp = 32
            self.warps_per_mp = 48
            self.threads_per_mp = 1536
            self.thread_blocks_per_mp = 8
            self.num_32b_reg_per_mp = 32768
            self.reg_alloc_unit_size = 64
            self.reg_alloc_granularity = 'warp'
            self.reg_per_thread = 63
            self.shared_mem_per_mp = 49152
            self.shared_mem_alloc_unit_size = 128
            self.warp_alloc_granularity = 2
            self.max_thread_block_size = 1024
        # Kepler (sm_3x)
        elif self.compute_capability[0]==3:
            self.threads_per_warp = 32
            self.warps_per_mp = 64
            self.threads_per_mp = 2048
            self.thread_blocks_per_mp = 16
            self.num_32b_reg_per_mp = 65536
            self.reg_alloc_unit_size = 256
            self.reg_alloc_granularity = 'warp'
            # sm_35 raised the per-thread register cap.
            if(self.compute_capability[1]==5):
                self.reg_per_thread = 255
            else:
                self.reg_per_thread = 63
            self.shared_mem_per_mp = 49152
            self.shared_mem_alloc_unit_size = 256
            self.warp_alloc_granularity = 4
            self.max_thread_block_size = 1024
        # Maxwell (sm_5x)
        elif self.compute_capability[0]==5: #[KR]: copy-pasted from Kepler and adjusted according to http://en.wikipedia.org/wiki/CUDA
            self.threads_per_warp = 32
            self.warps_per_mp = 64
            self.threads_per_mp = 2048
            self.thread_blocks_per_mp = 32
            self.num_32b_reg_per_mp = 65536
            self.reg_alloc_unit_size = 256
            self.reg_alloc_granularity = 'warp'
            self.reg_per_thread = 255
            self.shared_mem_per_mp = 65536
            self.shared_mem_alloc_unit_size = 256
            self.warp_alloc_granularity = 4
            self.max_thread_block_size = 1024
        else:
            raise Exception('Compute capability not supported!')
class PhysicalLimitsAMD:
    """Per-device occupancy limits for AMD GPUs, looked up by device name.

    Table fields: architecture family ('VLIW' or 'GCN'), maximum wavefronts
    per compute unit, local data share bytes per CU, and registers per CU.
    Raises KeyError for a device name not present in the table.
    """
    def __init__(self, dev):
        infos =\
        {
            #APU:
            'Devastator': {'arch': 'VLIW', 'WFmax_cu': 96, 'LDS_cu': 32768, 'GPR_cu': 8192},
            'Scrapper': {'arch': 'VLIW', 'WFmax_cu': 96, 'LDS_cu': 32768, 'GPR_cu': 8192},
            #HD5000
            'Cedar': {'arch': 'VLIW', 'WFmax_cu': 96, 'LDS_cu': 32768, 'GPR_cu': 8192},
            'Redwood': {'arch': 'VLIW', 'WFmax_cu': 62, 'LDS_cu': 32768, 'GPR_cu': 16384},
            'Juniper': {'arch': 'VLIW', 'WFmax_cu': 24.8, 'LDS_cu': 32768, 'GPR_cu': 16384},
            'Cypress': {'arch': 'VLIW', 'WFmax_cu': 27.6, 'LDS_cu': 32768, 'GPR_cu': 16384},
            'Hemlock': {'arch': 'VLIW', 'WFmax_cu': 24.8, 'LDS_cu': 32768, 'GPR_cu': 16384},
            #HD6000
            'Seymour': {'arch': 'VLIW', 'WFmax_cu': 96, 'LDS_cu': 32768, 'GPR_cu': 16384},
            'Caicos': {'arch': 'VLIW', 'WFmax_cu': 96, 'LDS_cu': 32768, 'GPR_cu': 16384},
            'Turks': {'arch': 'VLIW', 'WFmax_cu': 41.3, 'LDS_cu': 32768, 'GPR_cu': 16384},
            'Whistler': {'arch': 'VLIW', 'WFmax_cu': 41.3, 'LDS_cu': 32768, 'GPR_cu': 16384},
            'Barts': {'arch': 'VLIW', 'WFmax_cu': 49.6, 'LDS_cu': 32768, 'GPR_cu': 16384},
            #HD7000
            'Capeverde': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
            'Pitcairn': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
            'Bonaire': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
            'Tahiti': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
            #Rx 200
            'Oland': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
            'Tonga': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
            'Hawaii': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536}
        }
        # Wavefront size is fixed at 64 lanes for all entries above.
        self.WFsize = 64
        self.WFmax_cu = infos[dev.name]['WFmax_cu']
        self.LDS_cu = infos[dev.name]['LDS_cu']
        self.GPR_cu = infos[dev.name]['GPR_cu']
        self.arch = infos[dev.name]['arch']
        pass
def _int_floor(value, multiple_of=1):
"""Round C{value} down to be a C{multiple_of} something."""
# Mimicks the Excel "floor" function (for code stolen from occupancy calculator)
from math import floor
return int(floor(value/multiple_of))*multiple_of
def _int_ceiling(value, multiple_of=1):
"""Round C{value} up to be a C{multiple_of} something."""
# Mimicks the Excel "floor" function (for code stolen from occupancy calculator)
from math import ceil
return int(ceil(value/multiple_of))*multiple_of
class OccupancyRecord:
    """Estimates GPU occupancy (%) for a kernel's resource usage.

    Dispatches on the device vendor; the result is stored in self.occupancy.
    """
    def init_nvidia(self, dev, threads, shared_mem, registers):
        # Port of NVIDIA's occupancy-calculator spreadsheet logic: occupancy
        # is bounded by whichever resource (warps, registers, shared memory)
        # allows the fewest resident blocks per multiprocessor.
        pl = PhysicalLimitsNV(dev)
        limits = []
        allocated_warps = max(1,_int_ceiling(threads/pl.threads_per_warp))
        max_warps_per_mp = pl.warps_per_mp
        limits.append((min(pl.thread_blocks_per_mp, _int_floor(max_warps_per_mp/allocated_warps)), 'warps'))
        if registers>0:
            if registers > pl.reg_per_thread:
                # Exceeding the per-thread register cap means the kernel cannot launch.
                limits.append((0, 'registers'))
            else:
                # Register allocation granularity differs between architectures
                # ('warp' on Fermi+, 'block' on Tesla).
                allocated_regs = {'warp': allocated_warps,
                                  'block': _int_ceiling(_int_ceiling(allocated_warps, pl.warp_alloc_granularity)*registers*pl.threads_per_warp,allocated_warps)}[pl.reg_alloc_granularity]
                max_reg_per_mp = {'warp': _int_floor(pl.num_32b_reg_per_mp/_int_ceiling(registers*pl.threads_per_warp, pl.reg_alloc_unit_size), pl.warp_alloc_granularity),
                                  'block':pl.num_32b_reg_per_mp}[pl.reg_alloc_granularity]
                limits.append((_int_floor(max_reg_per_mp/allocated_regs), 'registers'))
        if shared_mem>0:
            allocated_shared_mem = _int_ceiling(shared_mem, pl.shared_mem_alloc_unit_size)
            max_shared_mem_per_mp = pl.shared_mem_per_mp
            limits.append((_int_floor(max_shared_mem_per_mp/allocated_shared_mem), 'shared memory'))
        # The binding constraint is the smallest block count.
        limit, limited_by = min(limits)
        warps_per_mp = limit*allocated_warps
        self.occupancy = 100*warps_per_mp/pl.warps_per_mp
    def init_amd(self, dev, threads, shared_mem, NReg):
        # Occupancy as the fraction of the per-CU wavefront cap that survives
        # the work-group, LDS and register constraints.
        pl = PhysicalLimitsAMD(dev)
        limits = {}
        WFwg = _int_ceiling(threads/pl.WFsize)
        #WFmax without constraint
        if pl.arch=='VLIW':
            limits['wg'] = pl.WFmax_cu if WFwg > pl.WFmax_cu else _int_floor(pl.WFmax_cu,WFwg)
        else:
            limits['wg'] = min(16*WFwg, pl.WFmax_cu)
        #WFmax with LDS constraints
        if shared_mem > 0:
            WGmax = _int_floor(pl.LDS_cu/shared_mem)
            limits['lds'] = WGmax*WFwg
        #WFmax with GPR constraints
        if NReg > 0:
            #Amount of work group per CU
            NRegWG = NReg*pl.WFsize*WFwg
            WGmax = _int_floor(pl.GPR_cu/NRegWG)
            limits['gpr'] = WFwg*WGmax
        self.occupancy = 100.0*min(list(limits.values()))/pl.WFmax_cu
    def __init__(self, dev, threads, shared_mem=0, registers=0):
        # NOTE(review): an unrecognized vendor leaves self.occupancy unset —
        # later reads would raise AttributeError; confirm all callers pass
        # AMD/NVIDIA/INTEL devices only.
        vendor = dev.vendor
        if vendor == isc.vendor.AMD:
            self.init_amd(dev, threads, shared_mem, registers)
        elif vendor == isc.vendor.NVIDIA:
            self.init_nvidia(dev, threads, shared_mem, registers)
        elif vendor == isc.vendor.INTEL:
            # No detailed model for Intel: only the 128-register cap is checked.
            if registers>128:
                self.occupancy = 0
            else:
                self.occupancy = 100
def skip(template, symbolic):
    """Return True when a parameterization should be discarded without benchmarking.

    A template is skipped when its work-group size is not a multiple of the
    vendor's execution width (64 on AMD, 32 on NVIDIA, 8 on Intel), when the
    template itself reports it is invalid for the device, or when the
    estimated occupancy falls below 10%.
    """
    device = symbolic.context.queues[0].device
    local_size = template.local_size_0*template.local_size_1
    vendor = device.vendor
    if vendor == isc.vendor.AMD and local_size%64!=0:
        return True
    elif vendor == isc.vendor.NVIDIA and local_size%32!=0:
        return True
    elif vendor == isc.vendor.INTEL and local_size%8!=0:
        return True
    array_expressions = isc.array_expression_container(symbolic)
    # registers_usage is reported in bytes; /4 converts to 32-bit registers.
    # NOTE(review): assumption inferred from the division — confirm the unit.
    registers_usage = template.registers_usage(array_expressions)/4
    lmem_usage = template.lmem_usage(array_expressions)
    occupancy_record = OccupancyRecord(device, local_size, lmem_usage, registers_usage)
    if template.is_invalid(array_expressions, device) or occupancy_record.occupancy < 10:
        return True
    return False
def benchmark(template, symbolic):
    """Time the expression `symbolic` under `template`; +inf if occupancy < 15%.

    Installs `template` as the float32 model on the queue, warms up once, then
    flushes repeatedly until at least 1 ms of aggregate GPU time (from event
    timestamps, nanoseconds converted to seconds) has been collected.
    """
    queue = symbolic.context.queues[0]
    device = queue.device
    array_expressions = isc.array_expression_container(symbolic)
    registers_usage = template.registers_usage(array_expressions)/4
    lmem_usage = template.lmem_usage(array_expressions)
    local_size = template.local_size_0*template.local_size_1
    occupancy_record = OccupancyRecord(device, local_size, lmem_usage, registers_usage)
    if occupancy_record.occupancy < 15 :
        # Too few resident warps to be competitive: treat as infinitely slow.
        return float("inf")
    else:
        queue.models[template, isc.float32] = isc.model(isc.float32, template, queue)
        timings = []
        current_time = 0
        # Warm-up run (compilation, caches) excluded from the timings.
        x, events = isc.flush(symbolic)
        symbolic.context.queues[0].synchronize()
        while current_time < 1e-3:
            x, events = isc.flush(symbolic)
            symbolic.context.queues[0].synchronize()
            timings.append(1e-9*sum([e.elapsed_time for e in events]))
            current_time = current_time + timings[-1]
        # NOTE(review): reports the WORST observed time; median/min is more
        # common for benchmarking — confirm this is intentional.
        return np.max(timings)
def sanitize_string(string, keep_chars = ['_']):
    """Lower-case `string`, map spaces/dashes to '_' and drop every other non-alphanumeric character."""
    lowered = string.lower().replace(' ', '_').replace('-', '_')
    kept = [c for c in lowered if c.isalnum() or c in keep_chars]
    return ''.join(kept).rstrip()

View File

@@ -1,9 +0,0 @@
import array, random, itertools
import deap.tools
import numpy as np
from genetic import GeneticOperators
def genetic(symbolic, Template, compute_perf, perf_metric, out):
    """Run the genetic auto-tuner on `symbolic` and return the best decoded profile."""
    operators = GeneticOperators(symbolic, Template, out)
    best = operators.optimize(maxtime='5m0s', maxgen=10000, compute_perf=compute_perf, perf_metric=perf_metric)
    return best

102
tune/tools.py Normal file
View File

@@ -0,0 +1,102 @@
import isaac as isc
from numpy import mean, median
from math import ceil, exp, log, sqrt
def sanitize(string, keep_chars = ['_']):
    """Normalize `string` into a filesystem-friendly identifier (lower-case, '_' separators)."""
    normalized = string.replace(' ', '_').replace('-', '_').lower()
    filtered = ''.join(c for c in normalized if c.isalnum() or c in keep_chars)
    return filtered.rstrip()
def distance(x, y):
    """Euclidean distance between two equal-length coordinate sequences."""
    total = 0
    for a, b in zip(x, y):
        total += (a - b)**2
    return sqrt(total)
def linspace(a, b, n=100):
    """Return n evenly spaced values from a to b inclusive (just b when n < 2)."""
    if n < 2:
        return b
    step = (float(b) - a)/(n - 1)
    return [step * k + a for k in range(n)]
def expspace(a,b,N,r=128):
    """N logarithmically spaced values between a and b, each rounded up to a multiple of r."""
    points = linspace(log(a), log(b), N)
    return [int(ceil(exp(v)/r)*r) for v in points]
def benchmark(template, setting, tree):
    """Return the mean kernel time (seconds) of `tree` under `template(*setting)`.

    Installs the parameterized template as the float32 model on the context's
    first queue, then repeatedly enqueues the expression tree until at least
    10 ms of aggregate GPU time (summed from event timestamps) is collected.
    """
    queue = tree.context.queues[0]
    queue.models[template, isc.float32] = isc.model(isc.float32, template(*setting), queue)
    times = []
    total = 0
    # Removed: unused iteration counter and dead commented-out cache-flush
    # allocation from the original implementation.
    while total < 1e-2:
        z, events = isc.enqueue(tree)
        tree.context.queues[0].synchronize()
        # Event timestamps are in nanoseconds.
        times.append(1e-9*sum([e.elapsed_time for e in events]))
        total += times[-1]
    return mean(times)
def tree_of(template, sizes, context):
    """Build a representative expression tree (and its operands) for a template class.

    Returns (expression, operands) where `operands` are the arrays the
    expression references. Returns None implicitly for an unknown template.
    NOTE(review): only the vaxpy branch passes dtype explicitly — the other
    branches rely on isc.empty's default dtype; confirm it is float32.
    """
    if issubclass(template, isc.vaxpy):
        N, = sizes
        x = isc.empty(N, dtype=isc.float32, context=context)
        y = isc.empty(N, dtype=isc.float32, context=context)
        return x + y, (x, y)
    elif issubclass(template, isc.reduction):
        N, = sizes
        x = isc.empty(N, context=context)
        y = isc.empty(N, context=context)
        return isc.dot(x, y), (x, y)
    elif issubclass(template, isc.maxpy):
        M, N = sizes
        A = isc.empty((M,N), context=context)
        B = isc.empty((M,N), context=context)
        return A + B, (A, B)
    elif issubclass(template, isc.mreduction):
        # Column reduction is expressed as a row reduction of the transpose.
        T = template is isc.mreduction_cols
        M, N = sizes[::-1] if T else sizes
        A = isc.empty((M,N), context=context)
        x = isc.empty(N, context=context)
        return isc.dot(A.T, x) if T else isc.dot(A, x), (A, x)
    elif issubclass(template, isc.mproduct):
        # Allocate A/B in the layout matching the requested transpositions.
        AT = template is isc.mproduct_tn or template is isc.mproduct_tt
        BT = template is isc.mproduct_nt or template is isc.mproduct_tt
        M, N, K = sizes
        A = isc.empty((K, M) if AT else (M, K), context=context)
        B = isc.empty((N, K) if BT else (K, N), context=context)
        AA = A.T if AT else A
        BB = B.T if BT else B
        return isc.dot(AA, BB), (A, B)
def memory_footprint(template, sizes):
    """Estimated global-memory footprint in GB, assuming 4-byte (float32) elements."""
    if issubclass(template, isc.vaxpy):
        elements = 3*sizes[0]
    elif issubclass(template, isc.reduction):
        elements = 2*sizes[0]
    elif issubclass(template, isc.maxpy):
        elements = 3*sizes[0]*sizes[1]
    elif issubclass(template, isc.mreduction):
        elements = sizes[0]*sizes[1]
    elif issubclass(template, isc.mproduct):
        elements = sizes[0]*sizes[1] + sizes[0]*sizes[2] + sizes[1]*sizes[2]
    else:
        return None
    return 4*elements*1e-9
def metric_of(template):
    """Performance metric for a template: GB/s for memory-bound ops, GFLOP/s for mproduct."""
    bandwidth_bound = (isc.vaxpy, isc.reduction, isc.maxpy, isc.mreduction)
    flops_bound = (isc.mproduct,)
    # issubclass accepts a tuple of candidate bases.
    if issubclass(template, bandwidth_bound):
        return lambda sizes, t: memory_footprint(template, sizes)/t
    if issubclass(template, flops_bound):
        return lambda sizes, t: 2*sizes[0]*sizes[1]*sizes[2]*1e-9/t
def genetic_infos_of(template):
    """Genome layout for a template: categorical gene positions and bit widths per field."""
    layouts = [
        (isc.vaxpy,      {'categorical': [3], 'nbits': [3,4,4,2]}),
        (isc.reduction,  {'categorical': [3], 'nbits': [3,4,4,2]}),
        (isc.maxpy,      {'categorical': [5], 'nbits': [3,3,3,3,4,2]}),
        (isc.mreduction, {'categorical': [5], 'nbits': [3,3,3,3,4,2]}),
        (isc.mproduct,   {'categorical': [8,9], 'nbits': [3,3,3,3,3,2,2,2,2,2,3,3]}),
    ]
    # First matching base class wins, mirroring the original if/elif chain.
    for base, infos in layouts:
        if issubclass(template, base):
            return infos

133
tune/tune.py Normal file
View File

@@ -0,0 +1,133 @@
import random, argparse, json, os
from math import log, isinf
from itertools import chain, product
from numpy import argsort, argmax
from operator import mul
from sklearn import ensemble
import isaac as isc
import optimize, tools, model
def unique(L):
    """Return the elements of L with duplicates removed, preserving first-seen order."""
    seen = set()
    result = []
    for item in L:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result
def pow2range(a, b):
    """Powers of two from 2**a up to, but excluding, 2**b."""
    powers = []
    for exponent in range(a, b):
        powers.append(2 ** exponent)
    return powers
def tune(device, operation, json_path):
#List devices
platforms = isc.get_platforms()
context = isc.context(device)
#List of size tuples to use
sizes = list({isc.vaxpy: [(x,) for x in tools.expspace(1e3, 1e7, 4)],
isc.mreduction_cols: product(pow2range(4,17), pow2range(4,17)),
isc.mproduct_nt: product(pow2range(4, 17), pow2range(4, 17), pow2range(4, 17))}[operation])
sizes = unique(sizes)
sizes = [x for x in sizes if 1e-4 <= tools.memory_footprint(operation, x) <= 1e-1]
#Training data
performance = tools.metric_of(operation)
profiles = []
X = []
Y = []
for idx, x in enumerate(sizes):
print x
nparams = len(profiles)
tree, operands = tools.tree_of(operation, x, context)
#Check if the current best prediction is not a local optimum
if idx==0:
tune = True
predicted = None
else:
if nparams==1:
predicted = profiles[0]
else:
clf = ensemble.RandomForestRegressor(min(10, idx+1), max_depth=min(10, idx+1)).fit(X, Y)
#clf, nrmse = model.train(X, Y, profiles)
predperf = clf.predict(x)[0]
best = (-predperf).argsort()[:5]
perf = [performance(x, tools.benchmark(operation, profiles[b], tree)) for b in best]
predicted = profiles[best[argmax(perf)]]
tune = not optimize.is_local_optimum(predicted, operation, x, context)
#Retune if necessary
if tune:
#new = optimize.exhaustive(operation, x, context)
new = optimize.genetic(operation, x, context, niter=1000, naccept=1000, popsize=20, prior=predicted)[0]
if new not in profiles:
profiles.append(new)
if idx > 0:
for xx,yy in zip(X, Y):
_tree, _operands = tools.tree_of(operation, xx, context)
time = tools.benchmark(operation, new, _tree)
perf = performance(xx, time)
yy.append(0 if isinf(perf) else perf)
#Update dataset
y = []
fastest = max(predperf) if nparams > 1 else None
for ip, p in enumerate(profiles):
perf = 0 if fastest and ip < nparams and predperf[ip]/fastest < .1 else performance(x,tools.benchmark(operation, p, tree))
y.append(0 if isinf(perf) else perf)
X.append(x)
Y.append(y)
#Build model
clf, nrmse = model.train(X, Y, profiles)
print 'The optimal classifer has NRMSE = %.2g (%d estimators and the max depth is %d'%(nrmse, clf.n_estimators, clf.max_depth)
#Export to JSON
if os.path.isfile(json_path):
json_data = json.load(open(args.out, 'r'))
else:
json_data = {}
json_data["version"] = "1.0"
operation_name = operation.__name__
if operation_name not in json_data:
json_data[operation_name] = {}
json_data[operation_name]['float32'] = {}
D = json_data[operation_name]['float32']
if len(profiles) > 1:
D['predictor'] = [{'children_left': e.tree_.children_left.tolist(),
'children_right': e.tree_.children_right.tolist(),
'threshold': e.tree_.threshold.astype('float64').tolist(),
'feature': e.tree_.feature.astype('float64').tolist(),
'value': e.tree_.value[:,:,0].astype('float64').tolist()} for e in clf.estimators_]
D['profiles'] = [map(int, x) for x in profiles]
json.dump(json_data, open(json_path,'w'))
def parse_arguments():
platforms = isc.get_platforms()
devices = [d for platform in platforms for d in platform.get_devices()]
#Command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("-d", "--device", default=0, type=int, help='Device to tune for')
parser.add_argument("-o", "--operation", type=str, required=True, help='Operation to tune for')
parser.add_argument("-j", "--json", default='', type=str)
args = parser.parse_args()
device = devices[int(args.device)]
print("----------------")
print("Devices available:")
print("----------------")
for (i, d) in enumerate(devices):
selected = '[' + ('x' if device==d else '') + ']'
print selected , '-', isc.device_type_to_string(d.type), '-', d.name, 'on', d.platform.name
print("----------------")
operation = {'vaxpy': isc.vaxpy, 'dot': isc.reduction,
'maxpy': isc.maxpy, 'gemv_n': isc.mreduction_rows, 'gemv_t': isc.mreduction_cols,
'gemm_nn': isc.mproduct_nn, 'gemv_tn': isc.mproduct_tn, 'gemm_nt': isc.mproduct_nt, 'gemm_tt':isc.mproduct_tt}[args.operation]
if not args.json:
json = tools.sanitize(device.name) + '.json'
return (device, operation, json)
if __name__ == "__main__":
    # Enable command-queue profiling so kernel timings can be read from events.
    isc.state.queue_properties = isc.CL_QUEUE_PROFILING_ENABLE
    args = parse_arguments()
    tune(*args)