diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 000000000..bdb9e1ce7
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,45 @@
+cmake_minimum_required(VERSION 2.8)
+project(triton)
+include(CTest)
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+
+# Options
+option(BUILD_TESTS "Build C++ Triton tests" ON)
+option(BUILD_PYTHON_MODULE "Build Python Triton bindings" OFF)
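+# Example configure invocation (illustrative; LLVM_ROOT_DIR is the optional
+# hint honored by cmake/FindLLVM.cmake):
+#   cmake -DBUILD_PYTHON_MODULE=ON -DLLVM_ROOT_DIR=/path/to/llvm ..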
+
+# LLVM
+find_package(LLVM REQUIRED)
+include_directories(${LLVM_INCLUDE_DIRS})
+add_definitions(${LLVM_DEFINITIONS})
+
+# Default build type
+if(NOT CMAKE_BUILD_TYPE)
+ message(STATUS "Default build type: Release")
+ set(CMAKE_BUILD_TYPE "Release")
+endif()
+
+# Compiler flags
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+
+# Tests
+if(BUILD_TESTS)
+ message(STATUS "Adding C++ tests")
+ add_subdirectory(tests)
+endif()
+
+# Python module
+if(BUILD_PYTHON_MODULE)
+ message(STATUS "Adding Python module")
+ # PyBind11 wrapper source file
+ file(GLOB_RECURSE PYTHON_SRC python/src/bindings.cc)
+ include_directories(python/src/ ${PYTHON_INCLUDE_DIRS})
+endif()
+
+
+# Triton
+file(GLOB_RECURSE LIBTRITON_SRC lib/*.cc)
+link_directories(${LLVM_LIBRARY_DIRS})
+add_library(triton SHARED ${LIBTRITON_SRC} ${PYTHON_SRC})
+target_link_libraries(triton ${LLVM_LIBRARIES})
+
diff --git a/LICENSE b/LICENSE
new file mode 100755
index 000000000..464fb143d
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,26 @@
+/* Copyright 2018-2019 Philippe Tillet
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files
+* (the "Software"), to deal in the Software without restriction,
+* including without limitation the rights to use, copy, modify, merge,
+* publish, distribute, sublicense, and/or sell copies of the Software,
+* and to permit persons to whom the Software is furnished to do so,
+* subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+// The compiler front-end is based on a modified version of WGTCC
+// https://github.com/wgtdkp/wgtcc
+// Copyright (c) 2016 wgtdkp
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 000000000..8a5fe3e98
--- /dev/null
+++ b/README.md
@@ -0,0 +1,36 @@
+# Triton
+
+This is the development repository of Triton, a language and compiler for writing highly efficient custom Deep-Learning primitives.
+
+The formal foundations of this project are described in the following MAPL2019 publication: [Triton: An Intermediate Language and Compiler for Tiled Neural Network Computations](http://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf). Please cite us if you use our work!
+
+
+The main features of Triton at the moment are:
+- **PyTriton**: A Python API for writing custom operations for Triton-C compute-kernels. PyTriton automatically generates and just-in-time compiles Tensorflow and PyTorch bindings.
+- **Triton-C**: An imperative, single-threaded language for writing highly efficient compute-kernels at a relatively high abstraction level using numpy-like extensions of the C language.
+- **Triton-IR**: An intermediate-representation for optimizing multi-dimensional array operations in linear algebra programs.
+- **Triton-JIT**: An optimizing just-in-time compiler for Triton-C, which generates GPU code on par with state-of-the-art CUDA-C (e.g., [CUTLASS](https://github.com/NVIDIA/cutlass)) and PTX (e.g., [ISAAC](https://github.com/ptillet/isaac)). This includes transparent support for mixed-precision and Tensor Cores.
+
+
+
+
+## Installation
+
+Triton is a fairly self-contained package: it uses its own parser (forked from [wgtcc](https://github.com/wgtdkp/wgtcc)) and an LLVM-based code-generator. However, it currently still requires LLVM 8.0+ for PTX code generation.
+
+```
+sudo apt-get install llvm-8-dev
+git clone https://github.com/ptillet/triton.git;
+cd triton/python/;
+python setup.py develop;
+cd examples;
+python dot.py
+```
+
+## Tutorials
+
+- [The PyTriton API](https://github.com/ptillet/triton/blob/master/docs/pytriton.md)
+- [The Triton-C language](https://github.com/ptillet/triton/blob/master/docs/triton-c.md)
+- The Triton-IR representation (coming soon...)
+- The Triton-JIT compiler (coming soon...)
+
diff --git a/cmake/FindLLVM.cmake b/cmake/FindLLVM.cmake
new file mode 100644
index 000000000..30ebcbd89
--- /dev/null
+++ b/cmake/FindLLVM.cmake
@@ -0,0 +1,166 @@
+# - Find LLVM headers and libraries.
+# This module locates LLVM and adapts the llvm-config output for use with
+# CMake.
+#
+# A given list of COMPONENTS is passed to llvm-config.
+#
+# The following variables are defined:
+# LLVM_FOUND - true if LLVM was found
+# LLVM_CXXFLAGS - C++ compiler flags for files that include LLVM headers.
+# LLVM_HOST_TARGET - Target triple used to configure LLVM.
+# LLVM_INCLUDE_DIRS - Directory containing LLVM include files.
+# LLVM_LDFLAGS - Linker flags to add when linking against LLVM
+# (includes -LLLVM_LIBRARY_DIRS).
+# LLVM_LIBRARIES - Full paths to the library files to link against.
+# LLVM_LIBRARY_DIRS - Directory containing LLVM libraries.
+# LLVM_ROOT_DIR - The root directory of the LLVM installation.
+# llvm-config is searched for in ${LLVM_ROOT_DIR}/bin.
+# LLVM_VERSION_MAJOR - Major version of LLVM.
+# LLVM_VERSION_MINOR - Minor version of LLVM.
+# LLVM_VERSION_STRING - Full LLVM version string (e.g. 6.0.0svn).
+# LLVM_VERSION_BASE_STRING - Base LLVM version string without git/svn suffix (e.g. 6.0.0).
+#
+# Note: The variable names were chosen in conformance with the official CMake
+# guidelines, see ${CMAKE_ROOT}/Modules/readme.txt.
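+#
+# Example usage (illustrative; "mytool" is a placeholder target):
+#   list(APPEND CMAKE_MODULE_PATH "/path/to/the/directory/containing/this/file")
+#   find_package(LLVM REQUIRED)
+#   include_directories(${LLVM_INCLUDE_DIRS})
+#   target_link_libraries(mytool ${LLVM_LIBRARIES})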
+
+# Try suffixed versions to pick up the newest LLVM install available on Debian
+# derivatives.
+# We also want a user-specified LLVM_ROOT_DIR to take precedence over the
+# system default locations such as /usr/local/bin. Executing find_program()
+# multiple times is the approach recommended in the docs.
+set(llvm_config_names llvm-config-9 llvm-config-9.0 llvm-config90
+ llvm-config-8 llvm-config-8.0 llvm-config80
+ llvm-config)
+find_program(LLVM_CONFIG
+ NAMES ${llvm_config_names}
+ PATHS ${LLVM_ROOT_DIR}/bin NO_DEFAULT_PATH
+ DOC "Path to llvm-config tool.")
+find_program(LLVM_CONFIG NAMES ${llvm_config_names})
+
+# Prints a warning/failure message depending on the required/quiet flags. Copied
+# from FindPackageHandleStandardArgs.cmake because it doesn't seem to be exposed.
+macro(_LLVM_FAIL _msg)
+ if(LLVM_FIND_REQUIRED)
+ message(FATAL_ERROR "${_msg}")
+ else()
+ if(NOT LLVM_FIND_QUIETLY)
+ message(STATUS "${_msg}")
+ endif()
+ endif()
+endmacro()
+
+
+if(NOT LLVM_CONFIG)
+ if(NOT LLVM_FIND_QUIETLY)
+ message(WARNING "Could not find llvm-config (LLVM >= ${LLVM_FIND_VERSION}). Try manually setting LLVM_CONFIG to the llvm-config executable of the installation to use.")
+ endif()
+else()
+ macro(llvm_set var flag)
+ if(LLVM_FIND_QUIETLY)
+ set(_quiet_arg ERROR_QUIET)
+ endif()
+ set(result_code)
+ execute_process(
+ COMMAND ${LLVM_CONFIG} --${flag}
+ RESULT_VARIABLE result_code
+ OUTPUT_VARIABLE LLVM_${var}
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ ${_quiet_arg}
+ )
+ if(result_code)
+ _LLVM_FAIL("Failed to execute llvm-config ('${LLVM_CONFIG}', result code: '${result_code}')")
+ else()
+ if(${ARGV2})
+ file(TO_CMAKE_PATH "${LLVM_${var}}" LLVM_${var})
+ endif()
+ endif()
+ endmacro()
+ macro(llvm_set_libs var flag)
+ if(LLVM_FIND_QUIETLY)
+ set(_quiet_arg ERROR_QUIET)
+ endif()
+ set(result_code)
+ execute_process(
+ COMMAND ${LLVM_CONFIG} --${flag} ${LLVM_FIND_COMPONENTS}
+ RESULT_VARIABLE result_code
+ OUTPUT_VARIABLE tmplibs
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ ${_quiet_arg}
+ )
+ if(result_code)
+ _LLVM_FAIL("Failed to execute llvm-config ('${LLVM_CONFIG}', result code: '${result_code}')")
+ else()
+ file(TO_CMAKE_PATH "${tmplibs}" tmplibs)
+ string(REGEX MATCHALL "${pattern}[^ ]+" LLVM_${var} ${tmplibs})
+ endif()
+ endmacro()
+
+ llvm_set(VERSION_STRING version)
+ llvm_set(CXXFLAGS cxxflags)
+ llvm_set(HOST_TARGET host-target)
+ llvm_set(INCLUDE_DIRS includedir true)
+ llvm_set(ROOT_DIR prefix true)
+ llvm_set(ENABLE_ASSERTIONS assertion-mode)
+
+ # The LLVM version string _may_ contain a git/svn suffix, so cut that off
+ string(SUBSTRING "${LLVM_VERSION_STRING}" 0 5 LLVM_VERSION_BASE_STRING)
+
+ # Versions below 4.0 do not support components debuginfomsf and demangle
+ if(${LLVM_VERSION_STRING} MATCHES "^3\\..*")
+ list(REMOVE_ITEM LLVM_FIND_COMPONENTS "debuginfomsf" index)
+ list(REMOVE_ITEM LLVM_FIND_COMPONENTS "demangle" index)
+ endif()
+ # Versions below 8.0 not supported
+ if(${LLVM_VERSION_STRING} MATCHES "^[3-7]\\..*")
+ message(FATAL_ERROR "LLVM version below 8.0 not supported")
+ endif()
+
+ llvm_set(LDFLAGS ldflags)
+ # In LLVM 3.5+, the system library dependencies (e.g. "-lz") are accessed
+ # using the separate "--system-libs" flag.
+ llvm_set(SYSTEM_LIBS system-libs)
+ string(REPLACE "\n" " " LLVM_LDFLAGS "${LLVM_LDFLAGS} ${LLVM_SYSTEM_LIBS}")
+ llvm_set(LIBRARY_DIRS libdir true)
+ llvm_set_libs(LIBRARIES libs)
+ # LLVM bug: llvm-config --libs tablegen returns -lLLVM-3.8.0
+ # but code for it is not in shared library
+ if("${LLVM_FIND_COMPONENTS}" MATCHES "tablegen")
+ if (NOT "${LLVM_LIBRARIES}" MATCHES "LLVMTableGen")
+ set(LLVM_LIBRARIES "${LLVM_LIBRARIES};-lLLVMTableGen")
+ endif()
+ endif()
+
+ # Versions below 4.0 do not support llvm-config --cmakedir
+ if(${LLVM_VERSION_STRING} MATCHES "^3\\..*")
+ set(LLVM_CMAKEDIR ${LLVM_LIBRARY_DIRS}/cmake/llvm)
+ else()
+ llvm_set(CMAKEDIR cmakedir)
+ endif()
+
+ llvm_set(TARGETS_TO_BUILD targets-built)
+ string(REGEX MATCHALL "${pattern}[^ ]+" LLVM_TARGETS_TO_BUILD ${LLVM_TARGETS_TO_BUILD})
+endif()
+
+# Remove some clang-specific flags for gcc.
+if(CMAKE_COMPILER_IS_GNUCXX)
+ string(REPLACE "-Wcovered-switch-default " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS})
+ string(REPLACE "-Wstring-conversion " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS})
+ string(REPLACE "-fcolor-diagnostics " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS})
+ string(REPLACE "-Werror=unguarded-availability-new " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS})
+endif()
+
+# Remove gcc-specific flags for clang.
+if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+ string(REPLACE "-Wno-maybe-uninitialized " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS})
+endif()
+
+string(REGEX REPLACE "([0-9]+).*" "\\1" LLVM_VERSION_MAJOR "${LLVM_VERSION_STRING}" )
+string(REGEX REPLACE "[0-9]+\\.([0-9]+).*[A-Za-z]*" "\\1" LLVM_VERSION_MINOR "${LLVM_VERSION_STRING}" )
+
+
+# Use the default CMake facilities for handling QUIET/REQUIRED.
+include(FindPackageHandleStandardArgs)
+
+find_package_handle_standard_args(LLVM
+ REQUIRED_VARS LLVM_ROOT_DIR LLVM_HOST_TARGET
+ VERSION_VAR LLVM_VERSION_STRING)
diff --git a/docs/pytriton.md b/docs/pytriton.md
new file mode 100644
index 000000000..0e04e6246
--- /dev/null
+++ b/docs/pytriton.md
@@ -0,0 +1,243 @@
+# The PyTriton API
+
+
+## Table of Contents
+
+1. [Motivations](#motivations)
+2. [Triton Functions](#pytriton-function)
+    1. [Creation of Triton Kernels](#creation-triton-kernels)
+    2. [Usage of Triton Kernels](#usage-triton-kernels)
+3. [Integration with Automatic Differentiation](#autodiff)
+    1. [Basics](#autodiff-basics)
+    2. [Convenience](#autodiff-convenience)
+
+
+## Motivations
+
+
+The purpose of PyTriton is to provide an API for easily executing Triton-C kernels from PyTorch and Tensorflow. One of the main advantages of PyTriton is that it is framework agnostic: any custom op written using this API will be transparently compatible with both Tensorflow and PyTorch without any additional effort required, as will be shown in this tutorial.
+
+Consider for example the following piece of code:
+
+```python
+import numpy as np
+import triton
+
+def run_tf():
+    M, N, K = 128, 128, 128
+    a = tf.placeholder(tf.float32, shape=[M, K])
+    b = tf.placeholder(tf.float32, shape=[N, K])
+    c = triton.ops.dot(a, b, transpose_a = False, transpose_b = True)
+    da, db = tf.gradients(c, [a, b])
+    # Run
+    ha = np.random.rand(M, K).astype(np.float32)
+    hb = np.random.rand(K, N).astype(np.float32)
+    sess = tf.InteractiveSession()
+    sess.run(tf.global_variables_initializer())
+    result = sess.run([da], feed_dict = {a: ha, b: hb})
+
+def run_torch():
+    M, N, K = 128, 128, 128
+    a = torch.randn(M, K).cuda()
+    b = torch.randn(K, N).cuda()
+    a.requires_grad_(True)
+    b.requires_grad_(True)
+    c = triton.ops.dot(a, b, False, True)
+    c.backward(torch.ones_like(c))
+    da = a.grad.clone()
+    db = b.grad.clone()
+
+## Run on tensorflow
+# import tensorflow as tf
+# run_tf()
+
+## Run on pytorch
+# import torch
+# run_torch()
+```
+
+PyTriton works by detecting which frameworks are imported and automatically generating and just-in-time compiling C++ binding code for them. Specifically, the following chain of events is triggered when a Triton operation is executed:
+
+1. The imported frameworks are detected (a minimal sketch of this step is shown right after this list).
+2. C++ binding code for Tensorflow or PyTorch is generated, compiled and cached.
+3. The corresponding custom-op is automatically loaded from the generated .so file, and a framework-agnostic wrapper is created.
+4. The wrapper is called and a tf.tensor or a torch.tensor is returned. In the case of Tensorflow, the gradient is also registered at this point if applicable.
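+
+For intuition, step 1 amounts to something along the lines of the following sketch (purely illustrative; the actual detection logic inside PyTriton may differ):
+
+```python
+import sys
+
+def detect_frameworks():
+    # A framework shows up in sys.modules only once the user has imported it.
+    frameworks = []
+    if 'tensorflow' in sys.modules:
+        frameworks.append('tensorflow')
+    if 'torch' in sys.modules:
+        frameworks.append('torch')
+    return frameworks
+```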
+
+
+The remainder of this tutorial will show you how to re-implement the above `triton.ops.dot` operation from scratch.
+
+## PyTriton Functions
+
+The PyTriton API provides a `triton.function` class which automatically handles the interaction with automatic differentiation in whichever framework was detected. Therefore, every differentiable custom operation written with PyTriton should inherit from this class:
+
+```python
+import triton
+
+# Entry point
+class _dot(triton.function):
+
+    @staticmethod
+    # Forward Pass
+    def forward(ctx, *args):
+        ...
+
+    @staticmethod
+    # Backward Pass
+    def backward(ctx, dy):
+        ...
+```
+
+### Creation of Triton Kernels
+
+
+PyTriton also provides a `triton.kernel` class which automatically takes care of interaction with the Triton-JIT as well as the generation and compilation of C++ framework binding code. For our dot operation, we create a kernel from the Triton-C code derived at the end of the [previous tutorial](https://github.com/ptillet/triton/blob/master/docs/triton-c.md):
+
+```python
+src = """
+__global__ void dot(TYPE * A, TYPE * B, TYPE * C,
+                    int M, int N, int K,
+                    int lda __multipleof(8), int ldb __multipleof(8), int ldc __multipleof(8)) {
+  // prologue
+  int pm = get_program_id(0);
+  int pn = get_program_id(1);
+  int rm[TM] = pm * TM + 0 ... TM;
+  int rn[TN] = pn * TN + 0 ... TN;
+  int rk[TK] = 0 ... TK;
+  float c[TM, TN] = 0;
+  // pointers to operands
+  TYPE* pa[SHAPE_A] = A + rk[BROADCAST_AK] * STRIDE_AK + rm[BROADCAST_AM] * STRIDE_AM;
+  TYPE* pb[SHAPE_B] = B + rk[BROADCAST_BK] * STRIDE_BK + rn[BROADCAST_BN] * STRIDE_BN;
+  // prefetch operands
+  TYPE a[SHAPE_A] = (*pa);
+  TYPE b[SHAPE_B] = (*pb);
+  // reduction loop
+  for(int k = K; k > 0; k -= TK){
+    c += USE_A @ USE_B;
+    pa = pa + TK * STRIDE_AK;
+    pb = pb + TK * STRIDE_BK;
+    a = *pa;
+    b = *pb;
+  }
+  // epilogue
+  int rcm[TM] = pm * TM + 0 ... TM;
+  int rcn[TN] = pn * TN + 0 ... TN;
+  TYPE* pc[TM, TN] = C + rcn[newaxis, :] + rcm[:, newaxis] * ldc;
+  *pc = c;
+}
+"""
+
+kernel = triton.kernel(src, ['C'])
+```
+
+Note that the second argument to the `triton.kernel` constructor indicates which of the operands our kernel function should return. Here, we only return `C`.
+
+At this point, `kernel` is a callable object which has the same signature as the `dot` function in our source code, except that pointers are treated as tensors:
+```
+[tensor, tensor, tensor, int, int, int, int, int, int]
+```
+
+### Usage of Triton Kernels
+
+However, in practice only `A` and `B` are provided by the user, and all the other `int` arguments can be derived from these operands alone. Hence, we create a helper function that extracts shapes from the `A` and `B` tensors, and then returns the result of a call to `kernel`:
+
+```python
+    @staticmethod
+    def _call(a, b, transpose_a, transpose_b):
+        # extract shapes
+        shape_a = triton.shape(a)
+        shape_b = triton.shape(b)
+        M, Ka = shape_a[0], shape_a[1]
+        Kb, N = shape_b[0], shape_b[1]
+        # transpose shapes
+        if transpose_a:
+            M, Ka = Ka, M
+        if transpose_b:
+            Kb, N = N, Kb
+        # contiguous dimensions
+        lda = M if transpose_a else Ka
+        ldb = Kb if transpose_b else N
+        ldc = N
+        # data-type
+        dtype = a.dtype
+        # allocate output
+        c = triton.empty([M, N], dtype = dtype)
+        # compute
+        grid = lambda opt: [triton.cdiv(M, opt.d('TM')), triton.cdiv(N, opt.d('TN'))]
+        # macros -- not necessary but makes kernel source-code simpler
+        macros = {# handle A transposition
+                  'USE_A'       : '^a'         if transpose_a else 'a',
+                  'STRIDE_AK'   : 'lda'        if transpose_a else '1',
+                  'STRIDE_AM'   : '1'          if transpose_a else 'lda',
+                  'BROADCAST_AK': ':, newaxis' if transpose_a else 'newaxis, :',
+                  'BROADCAST_AM': 'newaxis, :' if transpose_a else ':, newaxis',
+                  'SHAPE_A'     : 'TK, TM'     if transpose_a else 'TM, TK',
+                  # handle B transposition
+                  'USE_B'       : '^b'         if transpose_b else 'b',
+                  'STRIDE_BK'   : '1'          if transpose_b else 'ldb',
+                  'STRIDE_BN'   : 'ldb'        if transpose_b else '1',
+                  'BROADCAST_BK': 'newaxis, :' if transpose_b else ':, newaxis',
+                  'BROADCAST_BN': ':, newaxis' if transpose_b else 'newaxis, :',
+                  'SHAPE_B'     : 'TN, TK'     if transpose_b else 'TK, TN'}
+        return _dot.kernel(a, b, c, M, N, Ka, lda, ldb, ldc, grid,
+                           AT = transpose_a, BT = transpose_b, TYPE = dtype,
+                           TM = [32, 64, 128], TN = [32, 64, 128], TK = [8], **macros)
+
+```
+
+While this code should be mostly self-explanatory, there are a few noteworthy things worth pointing out:
+
+- `triton.shape` provides a framework-agnostic way to retrieve the shape of a tensor
+
+- `triton.empty` creates an empty tensor of the specified dimensions
+
+- `grid` corresponds to the grid with which our Triton kernel will be launched. Because in our case this grid depends on parametric tile variables, it is supplied as a function of compilation options `opt`, whose compile-time definitions can be retrieved using `opt.d(name)`. Here, `opt.d('TM')` and `opt.d('TN')` retrieve the first and second tile dimensions our kernel was compiled with. We also provide a helper `triton.cdiv` for ceil divisions (a short sketch is given after this list).
+
+- `macros` provides a dictionary of preprocessor definitions to compile the kernel with. Alternatively, these can also be supplied as named arguments to `_dot.kernel`. We recall that lists can be supplied to the preprocessor, in which case an auto-tuning procedure will be triggered. Here, the values of `TM` and `TN` are both tuned between 32, 64 and 128.
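+
+For reference, the ceil-division used to size the launch grid is simply the following (a trivial sketch of what a helper like `triton.cdiv` computes):
+
+```python
+def cdiv(a, b):
+    # smallest integer >= a / b
+    return (a + b - 1) // b
+
+# e.g. an output with M = 1000 rows tiled with TM = 128 needs cdiv(1000, 128) = 8 programs
+```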
+
+## Compatibility with Automatic Differentiation
+
+At this point, our custom operation only takes two tensor arguments and transposition information, which is good. However, it is still not compatible with PyTorch's or TensorFlow's automatic differentiation engine, and a small amount of additional effort is needed.
+
+### Basics
+
+PyTriton binds to Tensorflow's and PyTorch's automatic differentiation framework using a single, common API inspired by PyTorch. It consists of two static methods `forward` and `backward` that take a context as their first input:
+
+```python
+    @staticmethod
+    def forward(ctx, a, b, transpose_a = False, transpose_b = False):
+        ctx.save_for_backward(a, b)
+        ctx.t_a = transpose_a
+        ctx.t_b = transpose_b
+        return _dot._call(a, b, transpose_a, transpose_b)
+
+    @staticmethod
+    def backward(ctx, dy):
+        a, b = ctx.saved_tensors
+        t_a, t_b = ctx.t_a, ctx.t_b
+        if not t_a and not t_b:
+            da = _dot._call(dy, b, False, True)
+            db = _dot._call(a, dy, True, False)
+        elif not t_a and t_b:
+            da = _dot._call(dy, b, False, False)
+            db = _dot._call(dy, a, True, False)
+        elif t_a and not t_b:
+            da = _dot._call(b, dy, False, True)
+            db = _dot._call(a, dy, False, False)
+        elif t_a and t_b:
+            da = _dot._call(b, dy, True, True)
+            db = _dot._call(dy, a, True, True)
+        else:
+            assert False
+        return da, db, None, None
+```
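+
+For intuition, in the untransposed case `C = A @ B`, the chain rule gives `dA = dY @ B^T` and `dB = A^T @ dY`, which is exactly what the first branch above computes via `_call(dy, b, False, True)` and `_call(a, dy, True, False)`; the remaining branches apply the same reasoning to the transposed variants.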
+
+### Convenience
+
+As in PyTorch, a callable operation can then be created using the `apply` method of our `triton.function` class. We wrap it as a module variable for convenience:
+
+```python
+dot = _dot.apply
+```
+And that's it! Our custom op is now created and ready to be used with both PyTorch and Tensorflow.
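+
+For example, with PyTorch (an illustrative end-to-end sketch; it assumes the `_dot` class above is defined in scope and that a CUDA device is available):
+
+```python
+import torch
+
+a = torch.randn(128, 64, device='cuda', requires_grad=True)
+b = torch.randn(64, 256, device='cuda', requires_grad=True)
+c = dot(a, b)                   # dispatches to _dot.forward
+c.backward(torch.ones_like(c))  # dispatches to _dot.backward
+print(a.grad.shape, b.grad.shape)
+```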
diff --git a/docs/triton-c.md b/docs/triton-c.md
new file mode 100644
index 000000000..6222169b5
--- /dev/null
+++ b/docs/triton-c.md
@@ -0,0 +1,436 @@
+# The Triton-C Programming Language
+
+## Table of Contents
+1. [Motivations](#motivations)
+2. [Vector Addition](#vector-addition)
+    1. [Differences with CUDA](#differences-with-cuda)
+    2. [Advantages over CUDA](#advantages-over-cuda)
+        1. [Vectorization](#vectorization)
+        2. [Parameterization](#parameterization)
+        3. [Auto-Tuning](#auto-tuning)
+3. [Matrix Transposition](#matrix-transposition)
+    1. [Compute Kernel](#trans-compute-kernel)
+    2. [The __multipleof Attribute](#trans-multipleof)
+    3. [Conditional Dereferencing](#conditional-dereferencing)
+4. [Matrix Multiplication](#matrix-multiplication)
+    1. [Compute Kernel](#matmul-compute-kernel)
+    2. [Optimizations](#optimizations)
+        1. [Pre-Fetching](#pre-fetching)
+        2. [Rematerialization](#rematerialization)
+    3. [Fused Transpositions and Auto-Tuning](#fused-trans-autotuning)
+
+## Motivations
+
+In C and C++, arrays and pointers have similar semantics. Indeed, there is no native way to manipulate statically shaped multi-dimensional arrays (beyond initialization) as a whole:
+
+```c
+// C99
+float x[16][8] = {3.14};
+float y[16][8] = {5.17};
+// z = x + y
+float z[16][8];
+#pragma unroll
+for(int i = 0; i < 16; i++)
+  #pragma unroll
+  for(int j = 0; j < 8; j++)
+    z[i][j] = x[i][j] + y[i][j];
+```
+
+While it does not seem like a big deal at first sight, there are two issues with this:
+
+- **Ergonomics**: Of course, it is possible to simplify the above code using functions in C
+```c
+float z[16][8];
+add(z, x, y, 16, 8);
+```
+but this would be semantically different as the loops could no longer be unrolled, their bounds now being dynamic arguments of the `add` function. This can be mitigated using template metaprogramming (and operator overloading) in C++:
+
+```c
+// C++
+template<typename T, int M, int N>
+class matrix;
+
+matrix<float, 16, 8> x = {3.14};
+matrix<float, 16, 8> y = {5.17};
+matrix<float, 16, 8> z = x + y;
+```
+
+While this is better and now equivalent to our initial code snippet, the syntax is not quite as ergonomically satisfying as what native syntactic support could provide:
+```c
+// Triton-C
+float x[16, 8] = 3.14;
+float y[16, 8] = 5.17;
+// float z[8, 8] = x + y; // doesn't compile -- incompatible shapes!
+float z[16, 8] = x + y;
+float u[16] = z[:, +]; // sum along the second axis
+float v[16, 32] = u[:, newaxis]; // broadcasting along the second axis
+```
+which is valid _Triton-C_.
+
+
+- **Portability**: One other issue with our initial C program is that it is not portable. While it will run well on a single CPU thread, the operation `z = x + y` would underutilize a GPU Streaming Processor as it would execute on a single thread only. For this reason, it would have to be rewritten in CUDA as follows:
+```c
+// CUDA
+// Launch on a block of 16 x 8 threads
+float x = 3.14;
+float y = 5.17;
+float z = x + y;
+```
+In Triton-C, the same code can be used across many different platforms (only CPUs and GPUs are supported at the moment). Furthermore, Triton-C is single-threaded, hence easier to write than CUDA.
+
+- **Performance**: Another issue with our initial C code snippet is its performance. Although the loops are unrolled, the program does not carry any data-flow information pertaining to array operations. This issue gets more and more problematic as programs get increasingly complex, eventually culminating in matrix multiplication being remarkably hard to optimize.
+
+ This can be worked around using heavy metaprogramming techniques (see [CUTLASS](https://github.com/NVIDIA/cutlass)), but even then programmers still have to allocate and synchronize shared memory manually and endure prohibitively long compilation procedures not easily amenable to auto-tuning. For these reasons, most Deep-Learning frameworks still rely heavily on highly optimized subroutines (e.g., BLAS), which makes the development of novel custom primitives time-consuming for experts and almost impossible for others.
+
+ Triton addresses this issue by relying on **Triton-IR**, an LLVM-like IR for array operations, and **Triton-JIT**, an optimizing compiler for Triton-IR. These two systems are, however, beyond the scope of this tutorial. More information can be found [here](http://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf).
+
+
+_Note: You might be thinking that this is exactly what [MLIR](https://github.com/tensorflow/mlir) was made for... and you're right! You can conceptually think of Triton-IR as a dialect for MLIR, and Triton-C as a frontend for it. I would like to integrate Triton-IR into MLIR in the future; If you're interested in making this a thing, let me know._
+
+
+
+## Vector Addition
+
+### Differences with CUDA
+
+Let's start off by looking at a simple example. Vector addition, in its most trivial Triton-C implementation, can be written as follows:
+
+```c
+// Triton-C
+// launched on a grid of (N / 32) programs of 1 thread each
+__global__ void add(int N, float *a, float *b, float* c) {
+  int id = get_program_id(0);
+  int off[32] = id * 32 + (0 ... 32);
+  *(c + off) = *(a + off) + *(b + off);
+}
+```
+For reference, here is an equivalent CUDA kernel (NVCC will generate the same PTX for it as Triton-JIT does for the above code):
+
+```c
+// CUDA
+// launched on a grid of (N / 32) programs of 32 threads each
+__global__ void add(int N, float *a, float *b, float *c) {
+  int off = blockIdx.x * 32 + threadIdx.x;
+  c[off] = a[off] + b[off];
+}
+```
+
+As you can see, there are three main differences between our Triton-C kernel and the equivalent CUDA:
+
+- **The programming model is different**.
+While Triton-C and CUDA both use a Single-Program, Multiple-Data (SPMD) programming model, each Triton-C kernel is single-threaded.
+ Therefore, `get_program_id({0, 1, 2})` is equivalent to `blockIdx.{x, y, z}`, but there is no such thing as `blockDim` and `threadIdx`.
+
+- **The semantics of arrays is different**.
+In the above Triton-C kernel, `off` is an array of 32 consecutive integers: `int off[32] = {id * 32 + 0, id * 32 + 1, ..., id * 32 + 31}`.
+ As a result, the statement `c + off` implicitly broadcasts `c` and creates an array of 32 pointers. This could also be done explicitly as follows:
+```c
+float* c_broadcast[32] = c;
+float* c_ptr[32] = c_broadcast + off; // c_ptr = c + off
+```
+
+- **The semantics of the subscript operator is different**.
+In C/CUDA, subscripting can be used to offset and dereference a pointer, but in Triton-C it can only be used to index and broadcast an array (think NumPy).
+
+### Advantages over CUDA
+
+At this point, the advantages of Triton-C over CUDA may not be obvious, but they should become increasingly clear as this tutorial progresses. First and foremost, the purpose of this subsection is to show how Triton can be used to optimize vector additions by automatically taking care of load/store vectorization, code parameterization and auto-tuning -- all of which require nontrivial implementation efforts in CUDA.
+
+#### Vectorization
+
+On some hardware architectures, vectorizing load/store operations can lead to better memory utilization and, in turn, noticeable performance gains. In general, 128-bit memory transactions are favored, leading to the following CUDA kernel:
+```c
+// CUDA
+// launched on a grid of (N / 128) programs of 32 threads each
+__global__ void add(int N, float4 *a, float4 *b, float4 *c) {
+  int off = blockIdx.x * 32 + threadIdx.x;
+  c[off] = a[off] + b[off];
+}
+```
+Or, for half-precision inputs:
+```c
+// CUDA
+// launched on a grid of (N / 256) programs of 32 threads each
+__global__ void add(int N, half8 *a, half8 *b, half8 *c) {
+  int off = blockIdx.x * 32 + threadIdx.x;
+  c[off] = a[off] + b[off];
+}
+```
+
+Now this is a bit annoying, because as a programmer you have to keep track of not only the ideal vector size for each data-type (which might change in future GPU architectures), but also of how many elements are processed in each thread-block -- and adjust the grid size of the kernel accordingly! Not to mention that you may want to tune the thread-block size as well.
+
+In Triton-C, this is not a problem: the compiler will automatically figure out when and where vectorization should be used, without requiring any change to the source code.
+
+#### Parameterization
+
+Specifically, the Triton compiler would refuse to 4-way vectorize our above compute kernel because it would require the array `int off[32]` to be distributed over 8 threads, which is less than a warp. Fortunately, it turns out that this problem can be easily solved using preprocessor directives to _parameterize_ our kernel:
+```c
+// Triton-C
+// launched on a grid of (N / SIZE) programs of 1 thread each
+__global__ void add(int N, TYPE* a, TYPE* b, TYPE* c) {
+  int id = get_program_id(0);
+  int off[SIZE] = id * SIZE + (0 ... SIZE);
+  *(c + off) = *(a + off) + *(b + off);
+}
+// Not vectorized when compiled with -DSIZE=32 -DTYPE=float
+// 4-Vectorized when compiled with -DSIZE=128 -DTYPE=float
+// 8-Vectorized when compiled with -DSIZE=256 -DTYPE=half
+```
+Now, `TYPE` and `SIZE` are preprocessor macros which can be specified at compile-time, thereby giving the Triton compiler enough information to vectorize when beneficial without requiring any additional code modification.
+
+
+#### Auto-Tuning
+
+As it turns out, different input vector lengths `N` may require different values of `SIZE` to perform optimally. Fortunately, the Triton preprocessor also accepts lists of possible definitions for macros, in which case an auto-tuning procedure will be launched every time new input sizes are encountered. For example, compiling the above kernel with the option `-DSIZE=[32, 64, 128, 256] -DTYPE=float`
+will result in the parameter `SIZE` being automatically tuned every time a new value of `N` is encountered.
+
+_Note: Tuning our reference CUDA kernel would be much more cumbersome, as template metaprogramming would have to be used to ensure that proper vector types would be used_
+
+
+## Matrix Transposition
+
+Transpositions are (relatively) hard to efficiently write in CUDA because naive implementations typically suffer from _uncoalesced_ memory operations when writing back the transposed matrix to DRAM. Of course, this can be fixed by using shared memory as shown [here](https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/), but this comes at the cost of simplicity and -- more importantly -- interferes with auto-tuning.
+
+### Compute Kernel
+
+In Triton, however, kernels are single-threaded and the compiler automatically detects if and when data should be temporarily stashed to shared memory in order to enable shared memory stores/loads. Therefore, an optimal Triton kernel for this operation would look like:
+
+```c
+// launched on a grid of (M / TM) x (N / TN) programs of 1 thread each
+__global__ void transpose(TYPE * X, TYPE * Y, int M, int N, int ldx, int ldy) {
+  // extract program ID
+  int pidm = get_program_id(0); //(1)
+  int pidn = get_program_id(1); //(2)
+  // create 1D ranges along the matrix's two axes
+  int rm[TM] = pidm * TM + 0 ... TM; //(3)
+  int rn[TN] = pidn * TN + 0 ... TN; //(4)
+  // create 2D array of pointers
+  TYPE* px[TM, TN] = X + rm[:, newaxis] + rn[newaxis, :] * ldx; //(5)
+  TYPE* py[TN, TM] = Y + rm[newaxis, :] * ldy + rn[:, newaxis]; //(6)
+  // write back using the transposition operator '^'
+  *py = ^(*px); //(7)
+}
+```
+
+At a high level, this kernel loads a `TM x TN` tile from the input matrix `X`, transposes it and writes the resulting `TN x TM` tile to the output matrix `Y`. Eventually, transposition of the full input matrix is achieved by launching a grid of `(M / TM) x (N / TN)` programs decomposed as follows:
+
+- Statements (1) and (2) extract the coordinates of the program in the above 2D launch grid. For example, the program producing the output tile `Y[TN:2TN-1, 2TM:3TM-1]` holds the values:
+```
+pidm = 2
+pidn = 1
+```
+
+- Statements (3) and (4) construct the ranges of indices:
+```
+rm = [pidm*TM + 0, pidm*TM + 1, ..., pidm*TM + (TM - 1)]
+rn = [pidn*TN + 0, pidn*TN + 1, ..., pidn*TN + (TN - 1)]
+```
+
+which will be used in statements (5) and (6) to construct tiles of pointers
+
+- Statement (5) constructs the following array of pointers `px` using numpy-style broadcasting semantics:
+```
+│ X + (pidm*TM + 0)      + (pidn*TN + 0)*ldx,  ...,  ...,  X + (pidm*TM + 0)      + (pidn*TN + TN - 1)*ldx │
+│          ⋮                                                        ⋮                                      │
+│          ⋮                                                        ⋮                                      │
+│ X + (pidm*TM + TM - 1) + (pidn*TN + 0)*ldx,  ...,  ...,  X + (pidm*TM + TM - 1) + (pidn*TN + TN - 1)*ldx │
+```
+- Statement (6) constructs the following array of pointers `py` using numpy-style broadcasting semantics:
+```
+│ Y + (pidn*TN + 0)      + (pidm*TM + 0)*ldy,  ...,  ...,  Y + (pidn*TN + 0)      + (pidm*TM + TM - 1)*ldy │
+│          ⋮                                                        ⋮                                      │
+│          ⋮                                                        ⋮                                      │
+│ Y + (pidn*TN + TN - 1) + (pidm*TM + 0)*ldy,  ...,  ...,  Y + (pidn*TN + TN - 1) + (pidm*TM + TM - 1)*ldy │
+```
+- Statement (7) element-wise dereferences the above array of pointers `*px`, transposes it using the unary transposition operator `^`, and writes it back at the location specified by `py`.
+
+### The __multipleof Attribute
+
+The memory loads and stores in our transposition kernel are not vectorizable by default, since `X + ldx` (and `Y + ldy`) may be misaligned when `ldx` (and `ldy`) are not multiples of e.g., 4. This is unfortunate, because in Deep Learning tensor dimensions can easily be made nice powers of two (batch sizes and layer widths are flexible), so in practice the strides are usually suitably aligned -- the compiler just cannot assume it.
+
+For this reason, Triton provides a `__multipleof(N)` attribute for variables that are guaranteed to always be a multiple of N. In the case of matrix transposition, vector loads can be enabled by modifying the function's signature as follows:
+
+```c
+__global__ void transpose(TYPE * X, TYPE * Y, int M, int N, int ldx __multipleof(8), int ldy __multipleof(8)) {
+// ...
+}
+```
+
+### Conditional Dereferencing
+
+You might have noticed that the above code will fail when `M` and `N` are not multiples of `TM` and `TN` respectively. Fortunately, the above kernel can be slightly modified to handle this situation, as shown below:
+```c
+// launched on a grid of ((M + TM - 1) / TM) x ((N + TN - 1) / TN) programs
+__global__ void transpose(TYPE * X, TYPE * Y, int M, int N, int ldx, int ldy) {
+  // ...
+  // create bounds-checking mask
+  bool checkx[TM, TN] = (rm[:, newaxis] < M) && (rn[newaxis, :] < N); //(7a)
+  bool checky[TN, TM] = (rm[newaxis, :] < M) && (rn[:, newaxis] < N); //(7b)
+  // conditional write-back using the conditional dereferencing operator '*?()'
+  *?(checky)py = ^(*?(checkx)px); //(7)
+}
+```
+
+Here, statement (7a) creates an array of booleans `checkx[TM, TN]` such that `checkx(i, j) = True` if and only if `px(i, j)` should be dereferenced. Statement (7b) does the same for `py`. Both `px` and `py` are then conditionally dereferenced using Triton-C's conditional dereferencing operator `*?(predicate) pointer`.
+
+
+## Matrix Multiplication
+
+The purpose of this section is to present a Triton-C implementation of matrix multiplication that achieves performance competitive with the best existing hand-written CUDA kernels (see [CUTLASS](https://github.com/NVIDIA/cutlass)). We will also see how pre-processor macros can be leveraged to fuse transposition operations as well as to provide support for auto-tuning and FP16 Tensor Cores.
+
+_Note: Bounds-checking is omitted throughout for the sake of clarity. This feature can be easily added into our kernel, but may result in a slight performance hit because LLVM and PTXAS have issues dealing with conditionals and predicates inside loops._
+
+### Compute Kernel
+
+Matrix multiplications of the form `C = A x B` can be implemented in Triton-C fairly concisely, as shown below:
+
+```c
+// Triton-C
+// launched on a grid of (M / TM) x (N / TN) programs
+__global__ void dot(TYPE * A, TYPE * B, TYPE * C, int M, int N, int K,
+                    int lda __multipleof(8), int ldb __multipleof(8), int ldc __multipleof(8)) {
+  // prologue
+  int pm = get_program_id(0); //(1)
+  int pn = get_program_id(1); //(2)
+  int rm[TM] = pm * TM + 0 ... TM; //(3)
+  int rn[TN] = pn * TN + 0 ... TN; //(4)
+  int rk[TK] = 0 ... TK; //(5)
+  // initialize accumulator
+  float c[TM, TN] = 0; //(6)
+  // pointers to operands
+  TYPE* pa[TM, TK] = A + rk[newaxis, :] * 1 + rm[:, newaxis] * lda; //(7)
+  TYPE* pb[TK, TN] = B + rk[:, newaxis] * ldb + rn[newaxis, :] * 1; //(8)
+  // reduction loop
+  for(int k = K; k > 0; k -= TK){
+    // fetch operands
+    TYPE a[TM, TK] = *pa; //(9)
+    TYPE b[TK, TN] = *pb; //(10)
+    // matrix-multiply accumulate
+    c += a @ b; //(11)
+    // increment pointers
+    pa = pa + TK * 1; //(12)
+    pb = pb + TK * ldb; //(13)
+  }
+  // epilogue
+  TYPE* pc[TM, TN] = C + rn[newaxis, :] + rm[:, newaxis] * ldc; //(14)
+  *pc = c; //(15)
+}
+```
+Here, each kernel instance produces a `TM x TN` tile of the output matrix C as follows:
+
+- Statements (1) - (2) fetch the id of the current program instance.
+- Statements (3) - (4) construct ranges of indices to process for the vertical and horizontal axes of the output matrix `C`
+- Statement (5) constructs a range of indices along the reduction axis: `rk = [0, 1, ..., TK - 1]`
+- Statement (6) initializes a `TM x TN` array of accumulators to hold the result of `A[rm, :] x B[:, rn]`
+- Statements (7) - (8) initialize arrays of pointers `pa` and `pb` to the operands `A` and `B` using logic similar to that of the above transposition kernel
+- Statements (9) - (10) load tiles of operands by dereferencing `pa` and `pb`
+- Statement (11) updates the accumulator array using Triton-C's matrix multiplication operator '@'
+- Statements (12) - (13) update `pa` and `pb`
+- Statement (14) creates an array of pointers `pc` to the result matrix `C`
+- Statement (15) writes back the accumulator to `C`
+
+Internally, the Triton compiler will perform quite a few optimizations that will ensure good performance for this kernel:
+
+- Automatic coalescing of load/store operations
+- Automatic vectorization of load/store operations
+- Stashing `a` and `b` to shared memory
+- Automatic allocation of shared memory
+- Automatic synchronization of shared memory
+- Automatic padding of shared memory to avoid bank conflicts
+- Automatic usage of tensor cores when TYPE = half and TK % 4 = 0 (see the example right after this list)
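+
+For instance (an illustrative invocation that combines the macro conventions used throughout this tutorial), the kernel can be compiled for half-precision inputs, which makes it eligible for tensor cores since every candidate `TK` below is a multiple of 4:
+
+```c
+// FP16 inputs + auto-tuned tile sizes
+-DTYPE=half -DTM=[32, 64, 128] -DTN=[32, 64, 128] -DTK=[8, 16]
+```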
+
+### Optimizations
+
+Nonetheless, there are two important optimizations that the Triton compiler does not perform automatically at the moment, yet which are critical to achieve peak performance: pre-fetching and rematerialization. In this subsection we describe how these optimizations can be done manually by modifying the above source-code.
+
+#### Pre-Fetching
+
+The purpose of pre-fetching is to overlap the update of the accumulator `c` with the memory loads for the next tiles that will need to be multiplied. This can be done by modifying the above reduction loop as follows:
+
+```c
+// pre-fetch operands
+TYPE a[TM, TK] = *pa; //(9)
+TYPE b[TK, TN] = *pb; //(10)
+for(int k = K; k > 0; k -= TK){
+  c += a @ b;
+  pa = pa + TK * 1;
+  pb = pb + TK * ldb;
+  // don't prefetch last iteration
+  bool check = k > TK;
+  // pre-fetch operands
+  a = check ? *pa : 0;
+  b = check ? *pb : 0;
+}
+```
+
+Note that the Triton-C compiler will now also be able to use double-buffering techniques to make sure that the array `a` can be used and updated at the same time without any memory hazard.
+
+#### Rematerialization
+
+[Rematerialization](https://en.wikipedia.org/wiki/Rematerialization) is a compiler optimization which consists in recomputing some values instead of storing and reloading them from (register) memory, so as to decrease register pressure in the compute kernel. Although LLVM does this automatically to some extent, it fails to find good heuristics for the above kernel -- thereby requiring some source code modification to achieve optimal performance. Fortunately, only `rm` and `rn` need to be rematerialized, leading to the following epilogue:
+
+```c
+// epilogue
+int rcm[TM] = pm * TM + 0 ... TM;
+int rcn[TN] = pn * TN + 0 ... TN;
+TYPE* pc[TM, TN] = C + rcn[newaxis, :] + rcm[:, newaxis] * ldc;
+*pc = c;
+```
+
+### Fused Transpositions and Auto-Tuning
+
+It is common for optimized matrix-multiplication implementations (e.g., BLAS) to provide variants in which one or both operands are transposed. This is also what is done in the [PyTriton](https://github.com/ptillet/triton/blob/master/python/triton/ops/dot.py) implementation of matrix-multiplication. Fortunately, this can be done by using pre-processor macros for tile shapes and broadcasting directives, leading to the following kernel:
+
+```c
+// Triton-C
+// launched on a grid of (M / TM) x (N / TN) programs
+__global__ void dot(TYPE * A, TYPE * B, TYPE * C,
+                    int M, int N, int K,
+                    int lda __multipleof(8), int ldb __multipleof(8), int ldc __multipleof(8)) {
+  // prologue
+  int pm = get_program_id(0);
+  int pn = get_program_id(1);
+  int rm[TM] = pm * TM + 0 ... TM;
+  int rn[TN] = pn * TN + 0 ... TN;
+  int rk[TK] = 0 ... TK;
+  float c[TM, TN] = 0;
+  // pointers to operands
+  TYPE* pa[SHAPE_A] = A + rk[BROADCAST_AK] * STRIDE_AK + rm[BROADCAST_AM] * STRIDE_AM;
+  TYPE* pb[SHAPE_B] = B + rk[BROADCAST_BK] * STRIDE_BK + rn[BROADCAST_BN] * STRIDE_BN;
+  // prefetch operands
+  TYPE a[SHAPE_A] = (*pa);
+  TYPE b[SHAPE_B] = (*pb);
+  // reduction loop
+  for(int k = K; k > 0; k -= TK){
+    c += USE_A @ USE_B;
+    pa = pa + TK * STRIDE_AK;
+    pb = pb + TK * STRIDE_BK;
+    a = *pa;
+    b = *pb;
+  }
+  // epilogue
+  int rcm[TM] = pm * TM + 0 ... TM;
+  int rcn[TN] = pn * TN + 0 ... TN;
+  TYPE* pc[TM, TN] = C + rcn[newaxis, :] + rcm[:, newaxis] * ldc;
+  *pc = c;
+}
+```
+
+All matrix multiplication variants can then be retrieved using the following compilation options:
+```c
+// A is not transposed
+-DUSE_A=a -DSTRIDE_AK=1 -DSTRIDE_AM=lda -DBROADCAST_AK=newaxis,: -DBROADCAST_AM=:,newaxis -DSHAPE_A=TM,TK
+// A is transposed
+-DUSE_A=^a -DSTRIDE_AK=lda -DSTRIDE_AM=1 -DBROADCAST_AK=:,newaxis -DBROADCAST_AM=newaxis,: -DSHAPE_A=TK,TM
+// B is not transposed
+-DUSE_B=b -DSTRIDE_BK=ldb -DSTRIDE_BN=1 -DBROADCAST_BK=:,newaxis -DBROADCAST_BN=newaxis,: -DSHAPE_B=TK,TN
+// B is transposed
+-DUSE_B=^b -DSTRIDE_BK=1 -DSTRIDE_BN=ldb -DBROADCAST_BK=newaxis,: -DBROADCAST_BN=:,newaxis -DSHAPE_B=TN,TK
+```
+
+Auto-tuning can also be handled using pre-processor macros:
+```c
+// Auto-tuning TM and TN in {32, 64, 128}; TK in {8, 16}
+-DTM=[32, 64, 128] -DTN=[32, 64, 128] -DTK=[8, 16]
+```
+
diff --git a/include/triton/codegen/analysis/align.h b/include/triton/codegen/analysis/align.h
new file mode 100644
index 000000000..647db3984
--- /dev/null
+++ b/include/triton/codegen/analysis/align.h
@@ -0,0 +1,77 @@
+#ifndef TDL_INCLUDE_CODEGEN_ALIGNMENT_INFO_PASS_H
+#define TDL_INCLUDE_CODEGEN_ALIGNMENT_INFO_PASS_H
+
+#include