From 95f2564c1a8636a40f9234d67093dcf79bbf23f6 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 24 Aug 2015 19:24:33 -0400 Subject: [PATCH] Tuning: Android UI improvement --- cmake/python/setup.py | 2 +- include/isaac/driver/common.h | 17 +- include/nvrtc.h | 392 --------------------------------- lib/driver/backend.cpp | 12 +- lib/kernels/templates/gemm.cpp | 28 +-- lib/profiles/profiles.cpp | 11 +- python/setup.py | 6 +- tune/android/isaac.kv | 26 ++- tune/android/main.py | 47 +++- tune/android/screens/tune.kv | 14 +- tune/android/tune/optimize.py | 229 ++++++++++--------- tune/android/tune/tools.py | 9 +- tune/android/tune/tune.py | 355 +++++++++++++++-------------- 13 files changed, 422 insertions(+), 726 deletions(-) delete mode 100644 include/nvrtc.h diff --git a/cmake/python/setup.py b/cmake/python/setup.py index 75e7b444a..bcd2cf00a 100644 --- a/cmake/python/setup.py +++ b/cmake/python/setup.py @@ -155,7 +155,7 @@ def main(): author='Philippe Tillet', author_email='ptillet@g.harvard.edu', license='MPL 2.0', - packages=['isaac','isaac.external', 'isaac.external.sklearn'], + packages=['isaac', 'isaac.external', 'isaac.external.sklearn'], ext_package="isaac", ext_modules=extensions, cmdclass={'build_py': build_py, 'build_ext': build_ext_subclass}, diff --git a/include/isaac/driver/common.h b/include/isaac/driver/common.h index e94853de6..3c8dde3b2 100644 --- a/include/isaac/driver/common.h +++ b/include/isaac/driver/common.h @@ -1,15 +1,14 @@ #ifndef ISAAC_DRIVER_COMMON_H #define ISAAC_DRIVER_COMMON_H - - -#include -#include #include -#ifdef ISAAC_WITH_CUDA -#include -#include -#endif +//OpenCL Backend +#include "isaac/driver/external/CL/cl.h" +#include "isaac/driver/external/CL/cl_ext.h" +//CUDA Backend +#include "isaac/driver/external/CUDA/cuda.h" +#include "isaac/driver/external/CUDA/nvrtc.h" + #include "isaac/defines.h" DISABLE_MSVC_WARNING_C4275 @@ -30,7 +29,6 @@ enum backend_type -#ifdef ISAAC_WITH_CUDA namespace nvrtc { @@ -130,7 +128,6 @@ namespace cuda void check(CUresult); } -#endif namespace ocl { diff --git a/include/nvrtc.h b/include/nvrtc.h deleted file mode 100644 index 390de0cc9..000000000 --- a/include/nvrtc.h +++ /dev/null @@ -1,392 +0,0 @@ -/* - * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -#ifndef __NVRTC_H__ -#define __NVRTC_H__ - -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - -#include - - -/*****************************//** - * - * \defgroup error Error Handling - * - ********************************/ - - -/** - * \ingroup error - * \brief CUDA Online Compiler API call result code. - */ -typedef enum { - NVRTC_SUCCESS = 0, - NVRTC_ERROR_OUT_OF_MEMORY = 1, - NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2, - NVRTC_ERROR_INVALID_INPUT = 3, - NVRTC_ERROR_INVALID_PROGRAM = 4, - NVRTC_ERROR_INVALID_OPTION = 5, - NVRTC_ERROR_COMPILATION = 6, - NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7 -} nvrtcResult; - - -/** - * \ingroup error - * \brief ::nvrtcGetErrorString is a helper function that stringifies the - * given #nvrtcResult code, e.g., \link #nvrtcResult NVRTC_SUCCESS - * \endlink to \c "NVRTC_SUCCESS". For unrecognized enumeration - * values, it returns \c "NVRTC_ERROR unknown". - * - * \param [in] result CUDA Online Compiler API result code. - * \return Message string for the given #nvrtcResult code. - */ -const char *nvrtcGetErrorString(nvrtcResult result); - - -/****************************************//** - * - * \defgroup query General Information Query - * - *******************************************/ - - -/** - * \ingroup query - * \brief ::nvrtcVersion sets the output parameters \p major and \p minor - * with the CUDA Online Compiler version number. - * - * \param [out] major CUDA Online Compiler major version number. - * \param [out] minor CUDA Online Compiler minor version number. - * \return - * - \link #nvrtcResult NVRTC_SUCCESS \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink - * - */ -nvrtcResult nvrtcVersion(int *major, int *minor); - - -/********************************//** - * - * \defgroup compilation Compilation - * - ***********************************/ - - -/** - * \ingroup compilation - * \brief ::nvrtcProgram is the unit of compilation, and an opaque handle for - * a program. - * - * To compile a CUDA program string, an instance of nvrtcProgram must be - * created first with ::nvrtcCreateProgram, then compiled with - * ::nvrtcCompileProgram. - */ -typedef struct _nvrtcProgram *nvrtcProgram; - - -/** - * \ingroup compilation - * \brief ::nvrtcCreateProgram creates an instance of ::nvrtcProgram with the - * given input parameters, and sets the output parameter \p prog with - * it. - * - * \param [out] prog CUDA Online Compiler program. - * \param [in] src CUDA program source. - * \param [in] name CUDA program name.\n - * \p name can be \c NULL; \c "default_program" is - * used when \p name is \c NULL. - * \param [in] numHeaders Number of headers used.\n - * \p numHeaders must be greater than or equal to 0. - * \param [in] headers Sources of the headers.\n - * \p headers can be \c NULL when \p numHeaders is - * 0. - * \param [in] includeNames Name of each header by which they can be - * included in the CUDA program source.\n - * \p includeNames can be \c NULL when \p numHeaders - * is 0. - * \return - * - \link #nvrtcResult NVRTC_SUCCESS \endlink - * - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink - * - \link #nvrtcResult NVRTC_ERROR_PROGRAM_CREATION_FAILURE \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink - * - * \see ::nvrtcDestroyProgram - */ -nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, - const char *src, - const char *name, - int numHeaders, - const char **headers, - const char **includeNames); - - -/** - * \ingroup compilation - * \brief ::nvrtcDestroyProgram destroys the given program. - * - * \param [in] prog CUDA Online Compiler program. - * \return - * - \link #nvrtcResult NVRTC_SUCCESS \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink - * - * \see ::nvrtcCreateProgram - */ -nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog); - - -/** - * \ingroup compilation - * \brief ::nvrtcCompileProgram compiles the given program. - * - * The valid compiler options are: - * - * - Compilation targets - * - --gpu-architecture=\ (-arch)\n - * Specify the name of the class of GPU architectures for which the - * input must be compiled.\n - * - Valid GPU architecture names: - * - compute_20 - * - compute_30 - * - compute_35 - * - compute_50 - * - Default: compute_20 - * - Separate compilation / whole-program compilation - * - --device-c (-dc)\n - * Generate relocatable code that can be linked with other relocatable - * device code. It is equivalent to --relocatable-device-code=true. - * - --device-w (-dw)\n - * Generate non-relocatable code. It is equivalent to - * --relocatable-device-code=false. - * - --relocatable-device-code=[true, false] (-rdc)\n - * Enable (disable) the generation of relocatable device code. - * - Default: false - * - Debugging support - * - --device-debug (-G)\n - * Generate debug information. - * - --generate-line-info (-lineinfo)\n - * Generate line-number information. - * - Code generation - * - --maxrregcount=\ (-maxrregcount)\n - * Specify the maximum amount of registers that GPU functions can use. - * Until a function-specific limit, a higher value will generally - * increase the performance of individual GPU threads that execute this - * function. However, because thread registers are allocated from a - * global register pool on each GPU, a higher value of this option will - * also reduce the maximum thread block size, thereby reducing the amount - * of thread parallelism. Hence, a good maxrregcount value is the result - * of a trade-off. If this option is not specified, then no maximum is - * assumed. Value less than the minimum registers required by ABI will - * be bumped up by the compiler to ABI minimum limit. - * - --ftz=[true, false] (-ftz)\n - * When performing single-precision floating-point operations, flush - * denormal values to zero or preserve denormal values. --use_fast_math - * implies --ftz=true. - * - Default: false - * - --prec-sqrt=[true, false] (-prec-sqrt)\n - * For single-precision floating-point square root, use IEEE - * round-to-nearest mode or use a faster approximation. --use_fast_math - * implies --prec-sqrt=false. - * - Default: true - * - --prec-div=[true, false] (-prec-div)\n - * For single-precision floating-point division and reciprocals, use IEEE - * round-to-nearest mode or use a faster approximation. --use_fast_math - * implies --prec-div=false. - * - Default: true - * - --fmad=[true, false] (-fmad)\n - * Enables (disables) the contraction of floating-point multiplies and - * adds/subtracts into floating-point multiply-add operations (FMAD, - * FFMA, or DFMA). --use_fast_math implies --fmad=true. - * - Default: true - * - --use_fast_math (-use_fast_math)\n - * Make use of fast math operations. --use_fast_math implies --ftz=true - * --prec-div=false --prec-sqrt=false --fmad=true. - * - Preprocessing - * - --define-macro=\ (-D)\n - * macrodef can be either name or - * name=definitions. - * - name\n - * Predefine name as a macro with definition 1. - * - name=definition\n - * The contents of definition are tokenized and preprocessed - * as if they appeared during translation phase three in a \c \#define - * directive. In particular, the definition will be truncated by - * embedded new line characters. - * - --undefine-macro=\ (-U)\n - * Cancel any previous definition of \em name. - * - --include-path=\ (-I)\n - * Add the directory dir to the list of directories to be - * searched for headers. These paths are searched after the list of - * headers given to ::nvrtcCreateProgram. - * - --pre-include=\ (-include)\n - * Preinclude header during preprocessing. - * - Language Dialect - * - --std=c++11 (-std=c++11)\n - * Set language dialect to C++11. - * - --builtin-move-forward=[true, false] (-builtin-move-forward)\n - * Provide builtin definitions of std::move and std::forward, when C++11 - * language dialect is selected. - * - Default : true - * - --builtin-initializer-list=[true, false] (-builtin-initializer-list)\n - * Provide builtin definitions of std::initializer_list class and member - * functions when C++11 language dialect is selected. - * - Default : true - * - Misc - * - --disable-warnings (-w)\n - * Inhibit all warning messages. - * - --restrict (-restrict)\n - * Programmer assertion that all kernel pointer parameters are restrict - * pointers. - * - --device-as-default-execution-space -* (-default-device)\n - * Treat entities with no execution space annotation as \c __device__ - * entities. - * - * \param [in] prog CUDA Online Compiler program. - * \param [in] numOptions Number of compiler options passed. - * \param [in] options Compiler options in the form of C string array.\n - * \p options can be \c NULL when \p numOptions is 0. - * - * \return - * - \link #nvrtcResult NVRTC_SUCCESS \endlink - * - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_OPTION \endlink - * - \link #nvrtcResult NVRTC_ERROR_COMPILATION \endlink - * - \link #nvrtcResult NVRTC_ERROR_BUILTIN_OPERATION_FAILURE \endlink - */ -nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, - int numOptions, const char **options); - - -/** - * \ingroup compilation - * \brief ::nvrtcGetPTXSize sets \p ptxSizeRet with the size of the PTX - * generated by the previous compilation of \p prog (including the - * trailing \c NULL). - * - * \param [in] prog CUDA Online Compiler program. - * \param [out] ptxSizeRet Size of the generated PTX (including the trailing - * \c NULL). - * \return - * - \link #nvrtcResult NVRTC_SUCCESS \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink - * - * \see ::nvrtcGetPTX - */ -nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet); - - -/** - * \ingroup compilation - * \brief ::nvrtcGetPTX stores the PTX generated by the previous compilation - * of \p prog in the memory pointed by \p ptx. - * - * \param [in] prog CUDA Online Compiler program. - * \param [out] ptx Compiled result. - * \return - * - \link #nvrtcResult NVRTC_SUCCESS \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink - * - * \see ::nvrtcGetPTXSize - */ -nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx); - - -/** - * \ingroup compilation - * \brief ::nvrtcGetProgramLogSize sets \p logSizeRet with the size of the - * log generated by the previous compilation of \p prog (including the - * trailing \c NULL). - * - * Note that compilation log may be generated with warnings and informative - * messages, even when the compilation of \p prog succeeds. - * - * \param [in] prog CUDA Online Compiler program. - * \param [out] logSizeRet Size of the compilation log - * (including the trailing \c NULL). - * \return - * - \link #nvrtcResult NVRTC_SUCCESS \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink - * - * \see ::nvrtcGetProgramLog - */ -nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet); - - -/** - * \ingroup compilation - * \brief ::nvrtcGetProgramLog stores the log generated by the previous - * compilation of \p prog in the memory pointed by \p log. - * - * \param [in] prog CUDA Online Compiler program. - * \param [out] log Compilation log. - * \return - * - \link #nvrtcResult NVRTC_SUCCESS \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink - * - * \see ::nvrtcGetProgramLogSize - */ -nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log); - - -#ifdef __cplusplus -} -#endif /* __cplusplus */ - -#endif /* __NVRTC_H__ */ diff --git a/lib/driver/backend.cpp b/lib/driver/backend.cpp index 0f9e881f2..eae47885f 100644 --- a/lib/driver/backend.cpp +++ b/lib/driver/backend.cpp @@ -130,12 +130,12 @@ void backend::platforms(std::vector & platforms) #ifdef ISAAC_WITH_CUDA platforms.push_back(Platform(CUDA)); #endif -// cl_uint nplatforms; -// ocl::check(clGetPlatformIDs(0, NULL, &nplatforms)); -// std::vector clplatforms(nplatforms); -// ocl::check(clGetPlatformIDs(nplatforms, clplatforms.data(), NULL)); -// for(cl_platform_id p: clplatforms) -// platforms.push_back(Platform(p)); + cl_uint nplatforms; + ocl::check(clGetPlatformIDs(0, NULL, &nplatforms)); + std::vector clplatforms(nplatforms); + ocl::check(clGetPlatformIDs(nplatforms, clplatforms.data(), NULL)); + for(cl_platform_id p: clplatforms) + platforms.push_back(Platform(p)); } void backend::synchronize(Context const & context) diff --git a/lib/kernels/templates/gemm.cpp b/lib/kernels/templates/gemm.cpp index 353a58718..80eb7aff7 100644 --- a/lib/kernels/templates/gemm.cpp +++ b/lib/kernels/templates/gemm.cpp @@ -467,23 +467,23 @@ gemm_parameters::gemm_parameters(unsigned int simd_width stream << "}" << std::endl; - if(A_trans_=='N' || B_trans_=='T') - stream << "int Ky = K - idT.y;" << std::endl; - if(A_trans_=='T' || B_trans_=='N') - stream << "int Kx = K - idT.x;" << std::endl; +// if(A_trans_=='N' || B_trans_=='T') +// stream << "int Ky = K - idT.y;" << std::endl; +// if(A_trans_=='T' || B_trans_=='N') +// stream << "int Kx = K - idT.x;" << std::endl; - if(A_trans_=='N' || B_trans_=='T') - for(unsigned int k = 0; k < p_.kL; k += p_.local_fetch_1) - stream << "int condy" << k << " = " << k << " < Ky;" << std::endl; +// if(A_trans_=='N' || B_trans_=='T') +// for(unsigned int k = 0; k < p_.kL; k += p_.local_fetch_1) +// stream << "int condy" << k << " = " << k << " < Ky;" << std::endl; - if(A_trans_=='T' || B_trans_=='N') - { - for(unsigned int k = 0 ; k < p_.kL ; k += p_.local_fetch_0*p_.simd_width) - for(unsigned int s = 0 ; s < p_.simd_width ; ++s) - stream << "int condx" << k + s << " = " << k + s << " < Kx;" << std::endl; - } +// if(A_trans_=='T' || B_trans_=='N') +// { +// for(unsigned int k = 0 ; k < p_.kL ; k += p_.local_fetch_0*p_.simd_width) +// for(unsigned int s = 0 ; s < p_.simd_width ; ++s) +// stream << "int condx" << k + s << " = " << k + s << " < Kx;" << std::endl; +// } - fetch_to_lds(true); +// fetch_to_lds(true); stream << "//Write back C" << std::endl; stream << "M += ids.x;" << std::endl; diff --git a/lib/profiles/profiles.cpp b/lib/profiles/profiles.cpp index b23d26c46..3008ed500 100644 --- a/lib/profiles/profiles.cpp +++ b/lib/profiles/profiles.cpp @@ -119,16 +119,15 @@ void profiles::value_type::execute(controller const & expr) else if(predictor_.get()) { std::vector predictions = predictor_->predict(x); -// do{ + do{ label = std::distance(predictions.begin(),std::max_element(predictions.begin(), predictions.end())); -// predictions[label] = 0; -// }while(templates_[label]->temporary_workspace(expr.x()) > MAX_TEMPORARY_WORKSPACE); + predictions[label] = 0; + }while(templates_[label]->temporary_workspace(expr.x()) > MAX_TEMPORARY_WORKSPACE); } //Execution -// std::cout << label << std::endl; -// if(templates_[label]->temporary_workspace(expr.x()) > MAX_TEMPORARY_WORKSPACE) -// throw operation_not_supported_exception("Running this operation would require an overly large temporary."); + if(templates_[label]->temporary_workspace(expr.x()) > MAX_TEMPORARY_WORKSPACE) + throw operation_not_supported_exception("Running this operation would require an overly large temporary."); return templates_[label]->enqueue(queue_, program, tools::to_string(label), *fallback_, expr); } diff --git a/python/setup.py b/python/setup.py index 28fb8015e..0dc3fa2fb 100644 --- a/python/setup.py +++ b/python/setup.py @@ -114,7 +114,7 @@ def main(): #Include directories numpy_include = os.path.join(find_module("numpy")[1], "core", "include") - include =' src/include src/lib/external /usr/local/cuda/include'.split() + ['external/boost/', 'external/boost/boost/', numpy_include] + include =' src/include src/lib/external'.split() + ['external/boost/', 'external/boost/boost/', numpy_include] #Android if for_android: @@ -124,7 +124,7 @@ def main(): libraries += ['gnustl_shared'] #Source files - src = 'src/lib/exception/operation_not_supported.cpp src/lib/exception/unknown_datatype.cpp src/lib/value_scalar.cpp src/lib/driver/check.cpp src/lib/driver/ndrange.cpp src/lib/driver/platform.cpp src/lib/driver/backend.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/event.cpp src/lib/driver/kernel.cpp src/lib/driver/handle.cpp src/lib/driver/device.cpp src/lib/driver/program_cache.cpp src/lib/driver/buffer.cpp src/lib/driver/context.cpp src/lib/kernels/templates/axpy.cpp src/lib/kernels/templates/gemv.cpp src/lib/kernels/templates/dot.cpp src/lib/kernels/templates/base.cpp src/lib/kernels/templates/ger.cpp src/lib/kernels/templates/gemm.cpp src/lib/kernels/stream.cpp src/lib/kernels/keywords.cpp src/lib/kernels/mapped_object.cpp src/lib/kernels/binder.cpp src/lib/kernels/parse.cpp src/lib/wrap/clBLAS.cpp src/lib/profiles/predictors/random_forest.cpp src/lib/profiles/presets.cpp src/lib/profiles/profiles.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/expression.cpp src/lib/symbolic/io.cpp src/lib/symbolic/preset.cpp src/lib/array.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']] + src = 'src/lib/symbolic/preset.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/io.cpp src/lib/symbolic/expression.cpp src/lib/array.cpp src/lib/value_scalar.cpp src/lib/driver/backend.cpp src/lib/driver/device.cpp src/lib/driver/kernel.cpp src/lib/driver/buffer.cpp src/lib/driver/platform.cpp src/lib/driver/check.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/program_cache.cpp src/lib/driver/context.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/handle.cpp src/lib/exception/unknown_datatype.cpp src/lib/exception/operation_not_supported.cpp src/lib/profiles/presets.cpp src/lib/profiles/profiles.cpp src/lib/profiles/predictors/random_forest.cpp src/lib/kernels/templates/gemv.cpp src/lib/kernels/templates/axpy.cpp src/lib/kernels/templates/gemm.cpp src/lib/kernels/templates/ger.cpp src/lib/kernels/templates/dot.cpp src/lib/kernels/templates/base.cpp src/lib/kernels/mapped_object.cpp src/lib/kernels/stream.cpp src/lib/kernels/parse.cpp src/lib/kernels/keywords.cpp src/lib/kernels/binder.cpp src/lib/wrap/clBLAS.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']] boostsrc = 'external/boost/libs/' for s in ['numpy','python','smart_ptr','system','thread']: src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x] @@ -155,7 +155,7 @@ def main(): author='Philippe Tillet', author_email='ptillet@g.harvard.edu', license='MPL 2.0', - packages=['isaac','isaac.external', 'isaac.external.sklearn'], + packages=['isaac', 'isaac.external', 'isaac.external.sklearn'], ext_package="isaac", ext_modules=extensions, cmdclass={'build_py': build_py, 'build_ext': build_ext_subclass}, diff --git a/tune/android/isaac.kv b/tune/android/isaac.kv index 689424204..a60bf74b9 100644 --- a/tune/android/isaac.kv +++ b/tune/android/isaac.kv @@ -27,16 +27,26 @@ BoxLayout: ActionPrevious: with_previous: False - ActionToggleButton: - id: Tune - text: 'Tune' - on_release: app.show_tune() - group: 'menu' - - ActionToggleButton: + ActionButton: text: 'Settings' on_release: app.open_settings() group: 'menu' - + + BoxLayout: + orientation: 'horizontal' + size_hint: 1, .1 + + Button: + id: Tune + text: 'Tune' + on_release: app.show_tune() + group: 'menu' + + Button: + text: 'Benchmark' + group: 'menu' + + + ScreenManager: id: sm diff --git a/tune/android/main.py b/tune/android/main.py index ef2fd7845..65f2b6822 100644 --- a/tune/android/main.py +++ b/tune/android/main.py @@ -1,5 +1,7 @@ from os.path import dirname, realpath, join +from kivy.logger import Logger +from kivy.uix.scrollview import ScrollView from kivy.uix.boxlayout import BoxLayout from kivy.uix.label import Label from kivy.uix.checkbox import CheckBox @@ -12,11 +14,23 @@ from kivy.uix.settings import SettingsWithNoMenu import isaac as sc import json +import thread -from tune.tune import do_tuning +from tune.tune import Tuner __version__ = '1.0' +class ScrollableLabel(ScrollView): + text = StringProperty('') + +class LabelLogger: + def __init__(self, label): + self.label = label; + + def info(self, msg): + self.label.text += msg + '\n' + + class IsaacScreen(Screen): fullscreen = BooleanProperty(False) @@ -54,16 +68,24 @@ class IsaacApp(App): #Default view self.show_tune() + + #Logger + self.logger = LabelLogger(self.screens['Tune'].ids.out) def start_tuning(self): - #FIXME: will be buggy if two devices from two different platforms have the same name - device = next(x for x in self.isaac_handler.devices if x.name==self.config.get('hardware', 'device')) - operation = sc.templates.axpy - json_path = '' - #FIXME: Move profiling logics into tuning - sc.driver.default.queue_properties = sc.driver.PROFILING_ENABLE - print device.infos - do_tuning(device, operation, json_path) + button = self.screens['Tune'].ids.action_button + if button.text == 'Run': + #FIXME: will be buggy if two devices from two different platforms have the same name + device = next(x for x in self.isaac_handler.devices if x.name==self.config.get('hardware', 'device')) + #FIXME: Move profiling logics into tuning + sc.driver.default.queue_properties = sc.driver.PROFILING_ENABLE + self.logger.info('Using ' + device.name) + self.logger.info('') + tuner = Tuner(self.logger, device, sc.templates.axpy, '') + tid = thread.start_new_thread(Tuner.run, (tuner,)) + else: + pass + button.text = 'Running...' if button.text == 'Run' else button.text def show_benchmark(self): pass @@ -77,7 +99,8 @@ class IsaacApp(App): if 'Settings' not in self.screens: self.screens['Settings'] = Screen(name='Settings') self.screens['Settings'].add_widget(settings) - self.root.ids.sm.switch_to(self.screens['Settings'], direction='left') + if self.root.ids.sm.current != 'Settings': + self.root.ids.sm.switch_to(self.screens['Settings'], direction='left') def build_config(self, config): self.isaac_handler = IsaacHandler() @@ -104,10 +127,10 @@ class IsaacApp(App): 'key': operation.lower(), 'options': ['Simple', 'Intermediate', 'Full']}] - settings.add_json_panel('Settings', + settings.add_json_panel('ISAAC', self.config, data=json.dumps(layout)) - + def close_settings(self, *args): pass diff --git a/tune/android/screens/tune.kv b/tune/android/screens/tune.kv index 53483caff..f6e1ebdcf 100644 --- a/tune/android/screens/tune.kv +++ b/tune/android/screens/tune.kv @@ -1,3 +1,10 @@ +: + Label: + size_hint_y: None + height: self.texture_size[1] + text_size: self.width, None + text: root.text + IsaacScreen: name: 'Tune' fullscreen: True @@ -6,11 +13,14 @@ IsaacScreen: orientation: 'vertical' Button: - text: 'Start tuning' + id: action_button + background_color: .3,.3,.3,.3 + text: 'Run' pos: 0, 0 size_hint: 1, .1 on_release: app.start_tuning() - Label: + ScrollableLabel: + id: out text: '' diff --git a/tune/android/tune/optimize.py b/tune/android/tune/optimize.py index 187de5410..19d79b70d 100644 --- a/tune/android/tune/optimize.py +++ b/tune/android/tune/optimize.py @@ -12,14 +12,13 @@ from external.deap import tools as deap_tools from numpy import cumsum import tools +from tools import profile_execution_failure fetch_types = [sc.templates.fetching_policy_type.FETCH_FROM_LOCAL, sc.templates.fetching_policy_type.FETCH_FROM_LOCAL, sc.templates.fetching_policy_type.FETCH_FROM_LOCAL, sc.templates.fetching_policy_type.FETCH_FROM_LOCAL] -to_catch = (sc.OperationNotSupported, sc.OclLaunchOutOfResources, sc.CudaLaunchOutOfResources, sc.MemObjectAllocationFailure, sc.InvalidWorkGroupSize, sc.OutOfHostMemory, sc.InvalidValue) - def exhaustive(template, sizes, context): tree, _ = tools.tree_of(template, sizes, context) metric = tools.metric_of(template) @@ -35,127 +34,137 @@ def exhaustive(template, sizes, context): time = tools.benchmark(template, parameters, tree) if not best or time < best[1]: best = parameters, time - except to_catch: + except profile_execution_failure: pass if best: stdout.write('%.2f %% | Best %.2f [ for %s ]\r'%(float(idx*100)/len(ranges),metric(sizes, best[1]), best[0])) return best[0] - -def genetic(template, sizes, context, naccept=200, niter = 1000, cxpb=0.4, mutpb=0.4, popsize = 10, initializer = None, prior = None): - tree, _ = tools.tree_of(template, sizes, context) - metric = tools.metric_of(template) - genetic_infos = tools.genetic_infos_of(template) - nbits = genetic_infos['nbits'] - offsets = cumsum([0] + nbits) - def bin2gray(A): - g = [int(A[0])] - for i in range(1, len(A)): - g += [int(A[i-1] != A[i])] - return g +class GeneticOptimizer: - def gray2int(A): - b = [A[0]] - for i in range(1, len(A)): - b += [int(b[i-1] != A[i])] - return int(''.join(map(str,b)), 2) - - def encode(genome): - encoded = [bin2gray(bin(x)[2:].zfill(nb)) for x, nb in zip(genome, nbits)] - return sum(encoded, []) + def __init__(self, logger, naccept=500, niter=1000, cxpb=.4, mutpb=.4, popsize=10): + self.logger = logger + self.naccept = naccept + self.niter = niter + self.cxpb = cxpb + self.mutpb = mutpb + self.popsize = popsize - def decode(genome): - result = [] - for off1,off2 in zip(offsets[:-1],offsets[1:]): - result += [gray2int(genome[off1:off2])] - result = [fetch_types[x] if i in genetic_infos['categorical'] else 2**x for i,x in enumerate(result)] - return result + def run(self, template, sizes, context, initializer = None, prior = None): + tree, _ = tools.tree_of(template, sizes, context) + metric = tools.metric_of(template) + genetic_infos = tools.genetic_infos_of(template) + nbits = genetic_infos['nbits'] + offsets = cumsum([0] + nbits) - def evaluate(genome): - idx = tuple(genome) - if idx not in cache: - cache[idx] = tools.benchmark(template, decode(genome), tree) - return cache[idx], + def bin2gray(A): + g = [int(A[0])] + for i in range(1, len(A)): + g += [int(A[i-1] != A[i])] + return g - cache = {} - hof = deap_tools.HallOfFame(1) - - creator.create("FitnessMin", base.Fitness, weights=(-1.0,)) - creator.create("Individual", list, fitness=creator.FitnessMin) - - toolbox = base.Toolbox() - toolbox.register("evaluate", evaluate) - toolbox.register("mate", deap_tools.cxTwoPoint) - toolbox.register("mutate", deap_tools.mutFlipBit) - toolbox.register("select", deap_tools.selNSGA2) - - #Initialization - if initializer is None: - initializer = ([random.randint(0, 2**x) for x in nbits] for i in iter(int,1)) - population = [] - - genome = encode(prior if prior else list(initializer.next())) - while len(population) < popsize: - individual = creator.Individual(genome) - try: - individual.fitness.values = toolbox.evaluate(genome) - population += [individual] - except to_catch: - pass - genome = encode(list(initializer.next())) - hof.update(population) - - x = [] - y = [] - it = 0 - - while len(cache) < naccept and it 0: - for xx,yy in zip(X, Y): - _tree, _operands = tools.tree_of(operation, xx, context) - try: - time = tools.benchmark(operation, new, _tree) - perf = performance(xx, time) - except to_catch: - perf = 0 - yy.append(0 if isinf(perf) else perf) - #Update dataset - y = [] - fastest = max(predperf) if nparams > 1 else None - for ip, p in enumerate(profiles): - try: - perf = 0 if fastest and ip < nparams and predperf[ip]/fastest < .1 else performance(x,tools.benchmark(operation, p, tree)) - except to_catch: - perf = 0 - y.append(0 if isinf(perf) else perf) - X.append(x) - Y.append(y) + blas1_sizes = [(x,) for x in tools.expspace(1e3, 1e8, 30)] + sizes[sc.templates.axpy] = blas1_sizes + sizes[sc.templates.dot] = blas1_sizes + + #BLAS2 training sizes + if levels['BLAS2']=='simple': + blas2_sizes = [(1536, 1536)] + elif levels['BLAS2']=='intermediate': + blas2_sizes = [(1000,256), + (4096,256), + (256, 1000), + (256, 4096), + (169,256), + (169, 384), + (729,256), + (3025,96)] + else: + blas2_sizes = product(pow2range(4,17), pow2range(4,17)) + sizes[sc.templates.ger] = blas2_sizes + sizes[sc.templates.gemv_n] = blas2_sizes + sizes[sc.templates.gemv_t] = blas2_sizes + + #BLAS3 training sizes + if levels['BLAS3']=='simple': + blas3_sizes = [(1536,1536,1536)] + elif levels['BLAS3']=='intermediate': + blas3_sizes = [(32, 32, 16000), + (3025,96,363), + (729,128,1200), + (169,384,2304), + (169,192,1728), + (169,128,1728), + (169,1728,128), + (169,1728,192), + (169,2304,384), + (729,1200,128), + (1728,128,169), + (1728,192,169), + (2304,384,169), + (1200,128,729), + (363,96,3025)] + elif levels['BLAS3']=='full': + blas3_sizes = product(pow2range(5, 12), pow2range(5, 12), pow2range(5, 15)) + sizes[sc.templates.gemm_nn] = blas3_sizes + sizes[sc.templates.gemm_tn] = blas3_sizes + sizes[sc.templates.gemm_nt] = blas3_sizes + sizes[sc.templates.gemm_tt] = blas3_sizes + + #Remove duplicates + sizes = unique(list(sizes[operation])) + sizes = [x for x in sizes if 1e-4 <= tools.memory_footprint(operation, x) <= 1e-1] - for (fname, data) in zip(['X.csv', 'Y.csv', 'profiles.csv'], [X, Y, profiles]): - with open(os.path.join(savepath, fname), 'wb') as f: - csv.writer(f).writerows(data) - - - unused = where(bincount(argmax(Y, 1))==0)[0] - profiles = [x for ix,x in enumerate(profiles) if ix not in unused] - Y = delete(Y, unused, axis=1) - - #Export to JSON - json_path = tools.sanitize(device.name) + '.json' if not json_path else json_path - if os.path.isfile(json_path): - json_data = json.load(open(json_path, 'r')) - else: - json_data = {} - json_data["version"] = "1.0" - operation_name = operation.__name__ - if operation_name not in json_data: - json_data[operation_name] = {} - json_data[operation_name]['float32'] = {} - D = json_data[operation_name]['float32'] - if len(profiles) > 1: - clf, nrmse = model.train(X, Y, profiles) - D['predictor'] = [{'children_left': e.tree_.children_left.tolist(), - 'children_right': e.tree_.children_right.tolist(), - 'threshold': e.tree_.threshold.astype('float64').tolist(), - 'feature': e.tree_.feature.astype('float64').tolist(), - 'value': e.tree_.value[:,:,0].astype('float64').tolist()} for e in clf.estimators_] - D['profiles'] = [map(int, x) for x in profiles] - json.dump(json_data, open(json_path,'w')) + #Training data + performance = tools.metric_of(operation) + profiles, X, Y = [], [], [] + + #Restore previous run + savepath = os.path.join('save', operation.__name__) + if not os.path.exists(savepath): + os.makedirs(savepath) + + try: + with open(os.path.join(savepath, 'X.csv')) as f: + X = [tuple(map(int, row)) for row in csv.reader(f, delimiter=',')] + + with open(os.path.join(savepath, 'Y.csv')) as f: + Y = [map(float, row) for row in csv.reader(f, delimiter=',')] + + with open(os.path.join(savepath, 'profiles.csv')) as f: + def mmap(x): + if x=='FETCH_FROM_LOCAL': + return sc.templates.fetching_policy_type.FETCH_FROM_LOCAL + if x=='FETCH_FROM_GLOBAL_CONTIGUOUS': + return sc.templates.fetching_policy_type.FETCH_FROM_GLOBAL_CONTIGUOUS + if x=='FETCH_FROM_GLOBAL_STRIDED': + return sc.templates.fetching_policy_type.FETCH_FROM_GLOBAL_STRIDED + return int(x) + profiles = [map(mmap,row) for v in row for row in csv.reader(f, delimiter=',')] + except: + pass + + for idx, x in enumerate(sizes): + if x in X: + self.pprint_datapoint(x, Y[X.index(x)]) + continue + idx = len(X) + nparams = len(profiles) + tree, operands = tools.tree_of(operation, x, context) + #Check if the current best prediction is not a local optimum + if idx==0: + tune = True + predicted = None + else: + if nparams==1: + predicted = profiles[0] + else: + clf = RandomForestRegressor(min(10, idx+1), max_depth=min(10, idx+1)).fit(X, Y) + #clf, nrmse = model.train(X, Y, profiles) + predperf = clf.predict(x)[0] + best = (-predperf).argsort()[:5] + perf = [] + for b in best: + try: + perf += [performance(x, tools.benchmark(operation, profiles[b], tree))] + except (sc.OperationNotSupported, sc.LaunchOutOfResources, sc.MemObjectAllocationFailure): + pass + predicted = profiles[best[argmax(perf)]] + tune = not optimize.is_local_optimum(predicted, operation, x, context) + #tune = True + #Retune if necessary + if tune: + #new = optimize.exhaustive(operation, x, context) + optimizer = optimize.GeneticOptimizer(self.logger, naccept=1000, niter=1000, cxpb=.4, mutpb=.4, popsize=20) + new = optimizer.run(operation, x, context, prior=predicted)[0] + if new not in profiles: + profiles.append(new) + if idx > 0: + for xx,yy in zip(X, Y): + _tree, _operands = tools.tree_of(operation, xx, context) + try: + time = tools.benchmark(operation, new, _tree) + perf = performance(xx, time) + except profile_execution_failure: + perf = 0 + yy.append(0 if isinf(perf) else perf) + #Update dataset + y = [] + fastest = max(predperf) if nparams > 1 else None + for ip, p in enumerate(profiles): + try: + perf = 0 if fastest and ip < nparams and predperf[ip]/fastest < .1 else performance(x,tools.benchmark(operation, p, tree)) + except profile_execution_failure: + perf = 0 + y.append(0 if isinf(perf) else perf) + X.append(x) + Y.append(y) + + for (fname, data) in zip(['X.csv', 'Y.csv', 'profiles.csv'], [X, Y, profiles]): + with open(os.path.join(savepath, fname), 'wb') as f: + csv.writer(f).writerows(data) + + #Update logging + self.pprint_datapoint(x, y) + + #Export to JSON + json_path = tools.sanitize(device.name) + '.json' if not self.json_path else self.json_path + if os.path.isfile(json_path): + json_data = json.load(open(json_path, 'r')) + else: + json_data = {} + json_data["version"] = "1.0" + operation_name = operation.__name__ + if operation_name not in json_data: + json_data[operation_name] = {} + json_data[operation_name]['float32'] = {} + D = json_data[operation_name]['float32'] + if len(profiles) > 1: + clf, nrmse = model.train(X, Y, profiles) + D['predictor'] = [{'children_left': e.tree_.children_left.tolist(), + 'children_right': e.tree_.children_right.tolist(), + 'threshold': e.tree_.threshold.astype('float64').tolist(), + 'feature': e.tree_.feature.astype('float64').tolist(), + 'value': e.tree_.value[:,:,0].astype('float64').tolist()} for e in clf.estimators_] + D['profiles'] = [map(int, x) for x in profiles] + json.dump(json_data, open(json_path,'w'))