[PYTHON] Modified version number to v0.4

[DOCS] Various improvements and typo fixes
[CODEGEN] Make sure peephole is called before anything else in codegen
2021-05-06 02:58:42 -04:00 · 2021-05-06 02:58:14 -04:00 · 2021-03-28 17:08:38 -04:00 · 2021-03-26 01:37:23 -04:00 · 2021-03-24 01:24:50 -04:00 · 2021-03-23 03:32:51 -04:00
789 changed files with 93981 additions and 89009 deletions
--- a/.buildinfo
+++ b/.buildinfo
@@ -1,4 +0,0 @@
-# Sphinx build info version 1
-# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 76c4cbf22d8ff0aa13ac3f4683f2586c
-tags: 645f666f9bcd5a90fca523b33c5a78b7
--- a/.ci/azure-pipelines.yml
+++ b/.ci/azure-pipelines.yml
@@ -0,0 +1,42 @@
+name: Triton CI
+pool:
+  name: default
+
+# Some variables
+variables:
+- name: venv
+  value: venv
+
+# Do not run CI on pushes: the push (CI) trigger is disabled
+trigger: none
+# Run CI when a PR targeting master is created or updated
+pr:
+- master
+
+# Pipeline
+steps:
+- script: |
+    mkdir $(venv)
+    python -m virtualenv --python=python3 $(venv)
+    source $(venv)/bin/activate
+    python -m pip install --upgrade pip
+    pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio===0.7.2 \
+                -f https://download.pytorch.org/whl/torch_stable.html
+    cd python
+    python setup.py install
+  displayName: Setup python environment
+
+- script: |
+    source $(venv)/bin/activate
+    pip install matplotlib pandas
+    cd python/bench
+    python -m run 
+
+- publish: python/bench/results
+  artifact: Benchmarks
+
+- script: |
+    source $(venv)/bin/activate
+    pip install pytest
+    pytest .
+  displayName: 'Run Python tests'
--- a/.ci/build-wheels.yml
+++ b/.ci/build-wheels.yml
@@ -0,0 +1,35 @@
+trigger: none
+pr: none
+
+jobs:
+- job: linux
+
+  timeoutInMinutes: 180
+
+  pool: default
+
+  steps:
+    - bash: |
+        set -o errexit
+        python3 --version
+        python3 -m pip install --upgrade pip
+        pip3 install cibuildwheel==1.10.0
+        pip3 install twine
+      displayName: Install dependencies
+    - bash: |
+        #sed -i 's/name\=\"triton\"/name="triton-nightly"/g' python/setup.py
+        sed -i -r "s/version\=\"(.*)\"/version=\"\1-dev`date '+%Y%m%d'`\"/g" python/setup.py
+        echo "" >> python/setup.cfg
+        echo "[build_ext]" >> python/setup.cfg
+        echo "base-dir=/project" >> python/setup.cfg
+      displayName: Patch setup.py
+    - bash: |
+        export CIBW_BEFORE_BUILD="pip install cmake"
+        export CIBW_BUILD="{cp,pp}3*-manylinux_x86_64"
+        python3 -m cibuildwheel python --output-dir wheelhouse
+      displayName: Build wheels
+    - task: PublishBuildArtifacts@1
+      inputs: {pathtoPublish: 'wheelhouse'}
+    - bash: |
+        python3 -m twine upload wheelhouse/* --skip-existing -u $(PYPI_USERNAME) -p $(PYPI_PASSWORD) 
+      displayName: Upload wheels to PyPI
--- a/.cmake/api/v1/query/codemodel-v2
+++ b/.cmake/api/v1/query/codemodel-v2
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +0,0 @@
-.vscode
-./docs/_build/*
-./build/*
--- a/.htaccess
+++ b/.htaccess
@@ -1 +0,0 @@
-DirectoryIndex master/index.html
--- a/.nojekyll
+++ b/.nojekyll
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,100 @@
+cmake_minimum_required(VERSION 3.6)
+include(ExternalProject)
+
+if(NOT TRITON_LLVM_BUILD_DIR)
+    set(TRITON_LLVM_BUILD_DIR ${CMAKE_BINARY_DIR})
+endif()
+
+
+project(triton)
+include(CTest)
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+
+# Options
+option(BUILD_TUTORIALS "Build C++ Triton tutorials" ON)
+option(BUILD_PYTHON_MODULE "Build Python Triton bindings" OFF)
+
+# Default build type
+if(NOT CMAKE_BUILD_TYPE)
+  message(STATUS "Default build type: Release")
+  set(CMAKE_BUILD_TYPE "Release")
+endif()
+
+# Compiler flags
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__STDC_FORMAT_MACROS -fvisibility=default -std=gnu++14")
+
+
+
+##########
+# LLVM
+##########
+get_cmake_property(_variableNames VARIABLES)
+set(__variableNames ${_variableNames})
+
+# Generate the helper project from cmake/DownloadLLVM.in, then configure and
+# build it immediately (at configure time) so that the llvm-src directory
+# exists before it is handed to add_subdirectory() below.
+configure_file(cmake/DownloadLLVM.in ${TRITON_LLVM_BUILD_DIR}/llvm-download/CMakeLists.txt)
+execute_process(COMMAND "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" .
+    WORKING_DIRECTORY "${TRITON_LLVM_BUILD_DIR}/llvm-download"
+    RESULT_VARIABLE _llvm_download_result
+)
+# Stop right here on failure instead of failing later with a confusing
+# "add_subdirectory given source ... which is not an existing directory".
+if(_llvm_download_result)
+    message(FATAL_ERROR "Failed to configure the LLVM download project (result: ${_llvm_download_result})")
+endif()
+execute_process(COMMAND "${CMAKE_COMMAND}" --build .
+    WORKING_DIRECTORY "${TRITON_LLVM_BUILD_DIR}/llvm-download"
+    RESULT_VARIABLE _llvm_download_result
+)
+if(_llvm_download_result)
+    message(FATAL_ERROR "Failed to build the LLVM download project (result: ${_llvm_download_result})")
+endif()
+unset(_llvm_download_result)
+set(LLVM_TARGETS_TO_BUILD "NVPTX" CACHE INTERNAL "")
+set(LLVM_BUILD_RUNTIME "OFF" CACHE INTERNAL "")
+set(LLVM_BUILD_RUNTIMES "OFF" CACHE INTERNAL "")
+set(LLVM_BUILD_TOOLS "OFF" CACHE INTERNAL "")
+set(LLVM_BUILD_UTILS "OFF" CACHE INTERNAL "")
+set(LLVM_INCLUDE_BENCHMARKS "OFF" CACHE INTERNAL "")
+set(LLVM_INCLUDE_DOCS "OFF" CACHE INTERNAL "")
+set(LLVM_INCLUDE_EXAMPLES "OFF" CACHE INTERNAL "")
+set(LLVM_INCLUDE_GO_TESTS "OFF" CACHE INTERNAL "")
+set(LLVM_INCLUDE_RUNTIME "OFF" CACHE INTERNAL "")
+set(LLVM_INCLUDE_TESTS "OFF" CACHE INTERNAL "")
+set(LLVM_INCLUDE_TOOLS "OFF" CACHE INTERNAL "")
+set(LLVM_INCLUDE_UTILS "OFF" CACHE INTERNAL "")
+add_subdirectory(${TRITON_LLVM_BUILD_DIR}/llvm-src
+                 ${TRITON_LLVM_BUILD_DIR}/llvm-build)
+get_property(LLVM_LIBRARIES GLOBAL PROPERTY LLVM_COMPONENT_LIBS)
+# remove LLVM-specific variables so we don't pollute GUI
+get_cmake_property(_variableNames VARIABLES)
+list(REMOVE_ITEM _variableNames ${__variableNames})
+list(REMOVE_ITEM _variableNames ${LLVM_LIBRARIES})
+foreach (_variableName ${_variableNames})
+    unset(${_variableName} CACHE)
+endforeach()
+include_directories("${TRITON_LLVM_BUILD_DIR}/llvm-build/include/"
+                    "${TRITON_LLVM_BUILD_DIR}/llvm-src/include/")
+
+# Python module: collects the binding sources into PYTHON_SRC and sets up the
+# include/link directories they need. Only active with -DBUILD_PYTHON_MODULE=ON.
+if(BUILD_PYTHON_MODULE)
+    message(STATUS "Adding Python module")
+    set(PYTHON_SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR}/python/src)
+    # Build CUTLASS python wrapper if requested, i.e. when both the
+    # CUTLASS_INCLUDE_DIR and CUTLASS_LIBRARY_DIR environment variables are
+    # non-empty at configure time.
+    set(CUTLASS_INCLUDE_DIR "$ENV{CUTLASS_INCLUDE_DIR}")
+    set(CUTLASS_LIBRARY_DIR "$ENV{CUTLASS_LIBRARY_DIR}")
+    if(NOT("${CUTLASS_INCLUDE_DIR}" STREQUAL "") AND NOT("${CUTLASS_LIBRARY_DIR}" STREQUAL ""))
+        set(CUTLASS_SRC ${PYTHON_SRC_PATH}/cutlass.cc)
+        add_definitions(-DWITH_CUTLASS_BINDINGS)
+        set(CUTLASS_LIBRARIES "cutlass.a")
+    endif()
+    # Report the directory actually used (the variable is CUTLASS_INCLUDE_DIR;
+    # CUTLASS_INCLUDE_PATH is never set and always printed an empty line).
+    message(STATUS "CUTLASS include dir: ${CUTLASS_INCLUDE_DIR}")
+    include_directories("." ${PYTHON_SRC_PATH} ${PYTHON_INCLUDE_DIRS} ${CUTLASS_INCLUDE_DIR})
+    link_directories(${PYTHON_LINK_DIRS} ${CUTLASS_LIBRARY_DIR})
+    # CUTLASS_SRC is empty unless the CUTLASS wrapper was enabled above.
+    set(PYTHON_SRC ${PYTHON_SRC_PATH}/main.cc ${PYTHON_SRC_PATH}/triton.cc  ${PYTHON_SRC_PATH}/superblock.cc ${CUTLASS_SRC})
+endif()
+
+
+# Triton
+file(GLOB_RECURSE LIBTRITON_SRC lib/*.cc)
+add_library(triton SHARED ${LIBTRITON_SRC} ${PYTHON_SRC})
+target_link_libraries(triton ${LLVM_LIBRARIES})
+
+if(BUILD_PYTHON_MODULE)
+    target_link_libraries(triton ${TORCH_LIBRARIES} ${CUTLASS_LIBRARIES})
+endif()
+
+# Tutorials
+if(BUILD_TUTORIALS)
+  message(STATUS "Adding C++ tutorials")
+  add_subdirectory(tutorials)
+endif()
--- a/1
+++ b/1
@@ -1 +0,0 @@
-triton-lang.org
--- a/26
+++ b/26
@@ -0,0 +1,26 @@
+/* Copyright 2018-2021 Philippe Tillet
+* 
+* Permission is hereby granted, free of charge, to any person obtaining 
+* a copy of this software and associated documentation files 
+* (the "Software"), to deal in the Software without restriction, 
+* including without limitation the rights to use, copy, modify, merge, 
+* publish, distribute, sublicense, and/or sell copies of the Software, 
+* and to permit persons to whom the Software is furnished to do so, 
+* subject to the following conditions:
+* 
+* The above copyright notice and this permission notice shall be 
+* included in all copies or substantial portions of the Software.
+* 
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+// The compiler front-end is based on a modified version of WGTCC
+// https://github.com/wgtdkp/wgtcc
+// Copyright (c) 2016 wgtdkp
--- a/README.md
+++ b/README.md
@@ -0,0 +1,18 @@
+# Triton
+
+This is the development repository of Triton, a language and compiler for writing highly efficient custom Deep-Learning primitives. The aim of Triton is to provide an open-source environment to write fast code at higher productivity than CUDA, but also with higher flexibility than other existing DSLs.
+
+[![Build Status](https://dev.azure.com/triton-lang/Triton/_apis/build/status/ptillet.triton?branchName=master)](https://dev.azure.com/triton-lang/Triton/_build/latest?definitionId=10&branchName=master)
+
+The foundations of this project are described in the following MAPL2019 publication: [Triton: An Intermediate Language and Compiler for Tiled Neural Network Computations](http://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf). Please consider citing us if you use our work!
+
+The [official documentation](https://triton-lang.org) contains installation instructions and tutorials.
+
+# Compatibility
+
+Supported Platforms:
+  * Linux
+
+Supported Hardware:
+  * NVIDIA GPUs (Compute Capability 7.0+)
+  * Under development: AMD GPUs, CPUs
--- a/cmake/DownloadLLVM.in
+++ b/cmake/DownloadLLVM.in
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 3.6)
+
+project(llvm-download NONE)
+include(ExternalProject)
+
+
+ExternalProject_Add(llvm
+    URL "https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/llvm-11.0.0.src.tar.xz"
+    SOURCE_DIR "${TRITON_LLVM_BUILD_DIR}/llvm-src"
+    BINARY_DIR "${TRITON_LLVM_BUILD_DIR}/llvm-build"
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ""
+    INSTALL_COMMAND ""
+    TEST_COMMAND ""
+)
--- a/cmake/FindLLVM.cmake
+++ b/cmake/FindLLVM.cmake
@@ -0,0 +1,168 @@
+# - Find LLVM headers and libraries.
+# This module locates LLVM and adapts the llvm-config output for use with
+# CMake.
+#
+# A given list of COMPONENTS is passed to llvm-config.
+#
+# The following variables are defined:
+#  LLVM_FOUND          - true if LLVM was found
+#  LLVM_CXXFLAGS       - C++ compiler flags for files that include LLVM headers.
+#  LLVM_HOST_TARGET    - Target triple used to configure LLVM.
+#  LLVM_INCLUDE_DIRS   - Directory containing LLVM include files.
+#  LLVM_LDFLAGS        - Linker flags to add when linking against LLVM
+#                        (includes -LLLVM_LIBRARY_DIRS).
+#  LLVM_LIBRARIES      - Full paths to the library files to link against.
+#  LLVM_LIBRARY_DIRS   - Directory containing LLVM libraries.
+#  LLVM_ROOT_DIR       - The root directory of the LLVM installation.
+#                        llvm-config is searched for in ${LLVM_ROOT_DIR}/bin.
+#  LLVM_VERSION_MAJOR  - Major version of LLVM.
+#  LLVM_VERSION_MINOR  - Minor version of LLVM.
+#  LLVM_VERSION_STRING - Full LLVM version string (e.g. 6.0.0svn).
+#  LLVM_VERSION_BASE_STRING - Base LLVM version string without git/svn suffix (e.g. 6.0.0).
+#
+# Note: The variable names were chosen in conformance with the official CMake
+# guidelines, see ${CMAKE_ROOT}/Modules/readme.txt.
+
+# Try suffixed versions to pick up the newest LLVM install available on Debian
+# derivatives.
+# We also want a user-specified LLVM_ROOT_DIR to take precedence over the
+# system default locations such as /usr/local/bin. Executing find_program()
+# multiple times is the approach recommended in the docs.
+set(llvm_config_names llvm-config-11 llvm-config-11.0
+                      llvm-config-10 llvm-config-10.0 llvm-config100
+                      llvm-config-9 llvm-config-9.0 llvm-config90
+                      llvm-config-8 llvm-config-8.0 llvm-config80
+                      llvm-config)
+find_program(LLVM_CONFIG
+    NAMES ${llvm_config_names}
+    PATHS ${LLVM_ROOT_DIR}/bin NO_DEFAULT_PATH
+    DOC "Path to llvm-config tool.")
+find_program(LLVM_CONFIG NAMES ${llvm_config_names})
+
+# Reports ${_msg} according to how find_package(LLVM ...) was invoked:
+#   - REQUIRED:            aborts configuration with a fatal error;
+#   - otherwise, not QUIET: prints the message as a status line;
+#   - otherwise:            prints nothing.
+# Copied from FindPackageHandleStandardArgs.cmake because it doesn't seem to
+# be exposed.
+macro(_LLVM_FAIL _msg)
+  if(LLVM_FIND_REQUIRED)
+    message(FATAL_ERROR "${_msg}")
+  elseif(NOT LLVM_FIND_QUIETLY)
+    message(STATUS "${_msg}")
+  endif()
+endmacro()
+
+
+if(NOT LLVM_CONFIG)
+    if(NOT LLVM_FIND_QUIETLY)
+        message(WARNING "Could not find llvm-config (LLVM >= ${LLVM_FIND_VERSION}). Try manually setting LLVM_CONFIG to the llvm-config executable of the installation to use.")
+    endif()
+else()
+    # llvm_set(<var> <flag> [<is_path>])
+    #
+    # Runs `${LLVM_CONFIG} --<flag>` and stores its output, with trailing
+    # whitespace stripped, in LLVM_<var>. If a third argument that evaluates
+    # to true is given, the output is additionally normalized with
+    # file(TO_CMAKE_PATH). A non-zero result is reported through _LLVM_FAIL.
+    macro(llvm_set var flag)
+        # Suppress llvm-config's stderr for find_package(LLVM ... QUIET).
+        if(LLVM_FIND_QUIETLY)
+            set(_quiet_arg ERROR_QUIET)
+        endif()
+        set(result_code)
+        execute_process(
+            COMMAND ${LLVM_CONFIG} --${flag}
+            RESULT_VARIABLE result_code
+            OUTPUT_VARIABLE LLVM_${var}
+            OUTPUT_STRIP_TRAILING_WHITESPACE
+            ${_quiet_arg}
+        )
+        if(result_code)
+            _LLVM_FAIL("Failed to execute llvm-config ('${LLVM_CONFIG}', result code: '${result_code}')")
+        else()
+            # NOTE(review): when no third argument is passed, ${ARGV2} is
+            # expected to expand to nothing, making this condition false.
+            if(${ARGV2})
+                file(TO_CMAKE_PATH "${LLVM_${var}}" LLVM_${var})
+            endif()
+        endif()
+    endmacro()
+    # llvm_set_libs(<var> <flag>)
+    #
+    # Runs `${LLVM_CONFIG} --<flag> ${LLVM_FIND_COMPONENTS}`, normalizes the
+    # output with file(TO_CMAKE_PATH) and stores every whitespace-separated
+    # token of it as a list in LLVM_<var>. A non-zero result is reported
+    # through _LLVM_FAIL.
+    macro(llvm_set_libs var flag)
+        # Suppress llvm-config's stderr for find_package(LLVM ... QUIET).
+        if(LLVM_FIND_QUIETLY)
+            set(_quiet_arg ERROR_QUIET)
+        endif()
+        set(result_code)
+        execute_process(
+            COMMAND ${LLVM_CONFIG} --${flag} ${LLVM_FIND_COMPONENTS}
+            RESULT_VARIABLE result_code
+            OUTPUT_VARIABLE tmplibs
+            OUTPUT_STRIP_TRAILING_WHITESPACE
+            ${_quiet_arg}
+        )
+        if(result_code)
+            _LLVM_FAIL("Failed to execute llvm-config ('${LLVM_CONFIG}', result code: '${result_code}')")
+        else()
+            file(TO_CMAKE_PATH "${tmplibs}" tmplibs)
+            # The input is quoted so that empty llvm-config output yields an
+            # empty list instead of a "needs at least 5 arguments" error.
+            # NOTE(review): `pattern` is not set in the visible part of this
+            # file; if it is unset the regex reduces to "[^ ]+".
+            string(REGEX MATCHALL "${pattern}[^ ]+" LLVM_${var} "${tmplibs}")
+        endif()
+    endmacro()
+
+    llvm_set(VERSION_STRING version)
+    llvm_set(CXXFLAGS cxxflags)
+    llvm_set(HOST_TARGET host-target)
+    llvm_set(INCLUDE_DIRS includedir true)
+    llvm_set(ROOT_DIR prefix true)
+    llvm_set(ENABLE_ASSERTIONS assertion-mode)
+
+    # The LLVM version string _may_ contain a git/svn suffix, so keep only the
+    # leading numeric "major.minor[.patch]" part. A fixed-width substring is
+    # not used because it truncates two-digit majors ("11.0.0" -> "11.0.").
+    string(REGEX MATCH "^[0-9]+\\.[0-9]+(\\.[0-9]+)?" LLVM_VERSION_BASE_STRING "${LLVM_VERSION_STRING}")
+
+    # The version string is quoted in the checks below so that an empty value
+    # (llvm-config failed) evaluates to false instead of being a syntax error.
+    # Versions below 4.0 do not support components debuginfomsf and demangle
+    if("${LLVM_VERSION_STRING}" MATCHES "^3\\..*")
+        list(REMOVE_ITEM LLVM_FIND_COMPONENTS "debuginfomsf" index)
+        list(REMOVE_ITEM LLVM_FIND_COMPONENTS "demangle" index)
+    endif()
+    # Versions below 8.0 not supported
+    if("${LLVM_VERSION_STRING}" MATCHES "^[3-7]\\..*")
+        message(FATAL_ERROR "LLVM version below 8.0 not supported")
+    endif()
+
+    llvm_set(LDFLAGS ldflags)
+    # In LLVM 3.5+, the system library dependencies (e.g. "-lz") are accessed
+    # using the separate "--system-libs" flag.
+    llvm_set(SYSTEM_LIBS system-libs)
+    string(REPLACE "\n" " " LLVM_LDFLAGS "${LLVM_LDFLAGS} ${LLVM_SYSTEM_LIBS}")
+    llvm_set(LIBRARY_DIRS libdir true)
+    llvm_set_libs(LIBRARIES libs)
+    # LLVM bug: llvm-config --libs tablegen returns -lLLVM-3.8.0
+    # but code for it is not in shared library
+    if("${LLVM_FIND_COMPONENTS}" MATCHES "tablegen")
+        if (NOT "${LLVM_LIBRARIES}" MATCHES "LLVMTableGen")
+            set(LLVM_LIBRARIES "${LLVM_LIBRARIES};-lLLVMTableGen")
+        endif()
+    endif()
+
+    # Versions below 4.0 do not support llvm-config --cmakedir, so fall back
+    # to the conventional location under the library directory for them.
+    # (Quoted so that an empty version string is not a syntax error.)
+    if("${LLVM_VERSION_STRING}" MATCHES "^3\\..*")
+        set(LLVM_CMAKEDIR ${LLVM_LIBRARY_DIRS}/cmake/llvm)
+    else()
+        llvm_set(CMAKEDIR cmakedir)
+    endif()
+
+    # Turn the space-separated output of `llvm-config --targets-built` into a
+    # CMake list. The input is quoted so that empty output gives an empty
+    # list rather than an argument-count error.
+    llvm_set(TARGETS_TO_BUILD targets-built)
+    string(REGEX MATCHALL "${pattern}[^ ]+" LLVM_TARGETS_TO_BUILD "${LLVM_TARGETS_TO_BUILD}")
+endif()
+
+# Remove some clang-specific flags for gcc.
+# LLVM_CXXFLAGS is passed quoted everywhere below: when llvm-config was not
+# found the variable is unset, and an unquoted expansion would drop the
+# argument and make string(REPLACE) abort with an argument-count error before
+# find_package_handle_standard_args() can report the missing package.
+if(CMAKE_COMPILER_IS_GNUCXX)
+    string(REPLACE "-Wcovered-switch-default " "" LLVM_CXXFLAGS "${LLVM_CXXFLAGS}")
+    string(REPLACE "-Wstring-conversion " "" LLVM_CXXFLAGS "${LLVM_CXXFLAGS}")
+    string(REPLACE "-fcolor-diagnostics " "" LLVM_CXXFLAGS "${LLVM_CXXFLAGS}")
+    string(REPLACE "-Werror=unguarded-availability-new " "" LLVM_CXXFLAGS "${LLVM_CXXFLAGS}")
+endif()
+
+# Remove gcc-specific flags for clang.
+# The compiler ID is quoted so that an empty ID does not break the if().
+if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+    string(REPLACE "-Wno-maybe-uninitialized " "" LLVM_CXXFLAGS "${LLVM_CXXFLAGS}")
+endif()
+
+string(REGEX REPLACE "([0-9]+).*" "\\1" LLVM_VERSION_MAJOR "${LLVM_VERSION_STRING}" )
+string(REGEX REPLACE "[0-9]+\\.([0-9]+).*[A-Za-z]*" "\\1" LLVM_VERSION_MINOR "${LLVM_VERSION_STRING}" )
+
+
+# Use the default CMake facilities for handling QUIET/REQUIRED.
+include(FindPackageHandleStandardArgs)
+
+find_package_handle_standard_args(LLVM
+    REQUIRED_VARS LLVM_ROOT_DIR LLVM_HOST_TARGET
+    VERSION_VAR LLVM_VERSION_STRING)
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SPHINXPROJ    = Triton
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Triton documentation build configuration file, created by
+# sphinx-quickstart on Mon Feb 10 01:19:09 2020.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = []
+
+# Math Jax
+extensions += ['sphinx.ext.mathjax']
+
+# Sphinx gallery
+extensions += ['sphinx_gallery.gen_gallery']
+from sphinx_gallery.sorting import FileNameSortKey
+sphinx_gallery_conf = {
+    'examples_dirs': '../python/tutorials/',
+    'gallery_dirs': 'getting-started/tutorials',
+    'filename_pattern': '',
+    'ignore_pattern': r'__init__\.py',
+    'within_subsection_order': FileNameSortKey,
+}
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffixes as a list of strings:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = 'Triton'
+copyright = '2020, Philippe Tillet'
+author = 'Philippe Tillet'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = ''
+# The full version, including alpha/beta/rc tags.
+release = ''
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# These patterns also affect html_static_path and html_extra_path
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+import sphinx_rtd_theme
+html_theme = 'sphinx_rtd_theme'
+html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+html_css_files = [
+    'css/custom.css',
+]
+
+# Custom sidebar templates, must be a dictionary that maps document names
+# to template names.
+#
+# This is required for the alabaster theme
+# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
+html_sidebars = {
+    '**': [
+        'relations.html',  # needs 'show_related': True theme option to display
+        'searchbox.html',
+    ]
+}
+
+# -- Options for HTMLHelp output ------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'Tritondoc'
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    #
+    # 'papersize': 'letterpaper',
+
+    # The font size ('10pt', '11pt' or '12pt').
+    #
+    # 'pointsize': '10pt',
+
+    # Additional stuff for the LaTeX preamble.
+    #
+    # 'preamble': '',
+
+    # Latex figure (float) alignment
+    #
+    # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, 'Triton.tex', 'Triton Documentation', 'Philippe Tillet', 'manual'),
+]
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [(master_doc, 'triton', 'Triton Documentation', [author], 1)]
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'Triton', 'Triton Documentation', author, 'Triton', 'One line description of project.', 'Miscellaneous'),
+]
--- a/docs/getting-started/installation.rst
+++ b/docs/getting-started/installation.rst
@@ -0,0 +1,61 @@
+==============
+Installation
+==============
+
+---------------------
+Binary Distributions
+---------------------
+
+You can install the latest nightly release of Triton from pip:
+
+.. code-block:: bash
+  
+      pip install -U --pre triton
+
+
+--------------
+From Source
+--------------
+
+++++++++++++++
+Python Package
+++++++++++++++
+
+You can install the Python package from source by running the following commands:
+
+.. code-block:: bash
+
+      git clone https://github.com/ptillet/triton.git;
+      cd triton/python;
+      pip install -e .
+
+This may take a while (10-20 minutes) as it will download and compile LLVM from source.
+
+You can then test your installation by running the unit tests:
+
+.. code-block:: bash
+
+      pytest -vs .
+
+and the benchmarks
+
+.. code-block:: bash
+      
+      cd bench/
+      python -m run --with-plots --result-dir /tmp/triton-bench
+
+++++++++++++++
+C++ Package
+++++++++++++++
+
+Those not interested in Python integration may want to use the internals of Triton (i.e., runtime, parser, codegen, driver, intermediate representation) directly. This can be done by running the following commands:
+
+.. code-block:: bash
+
+      git clone https://github.com/ptillet/triton.git;
+      cd triton;
+      mkdir build;
+      cd build;
+      cmake ../;
+      make -j8;
+
+Note that while direct usage of the C++ API is not officially supported, a usage tutorial can be found `here <https://github.com/ptillet/triton/blob/master/tutorials/01-matmul.cc>`_.
--- a/v1.1.2/_sources/index.rst.txt
+++ b/v1.1.2/_sources/index.rst.txt
@@ -1,7 +1,7 @@
 Welcome to Triton's documentation!
 ==================================

-Triton is a language and compiler for parallel programming. It aims to provide a Python-based programming environment for productively writing custom DNN compute kernels capable of running at maximal throughput on modern GPU hardware.
+Triton is an imperative language and compiler for parallel programming. It aims to provide a programming environment for productively writing custom DNN compute kernels capable of running at maximal throughput on modern GPU hardware.

 Getting Started
 ---------------
@@ -17,31 +17,15 @@ Getting Started
   getting-started/installation
   getting-started/tutorials/index

-Python API
-------------------
-
- :doc:`triton <python-api/triton>`
- :doc:`triton.language <python-api/triton.language>`
- :doc:`triton.testing <python-api/triton.testing>`
-
-
-.. toctree::
-   :maxdepth: 1
-   :caption: Python API
-   :hidden:
-
-   python-api/triton
-   python-api/triton.language
-   python-api/triton.testing
-
-   
-Going Further
+Programming Guide
 ------------------

 Check out the following documents to learn more about Triton and how it compares against other DSLs for DNNs:

 - Chapter 1: :doc:`Introduction <programming-guide/chapter-1/introduction>`
 - Chapter 2: :doc:`Related Work <programming-guide/chapter-2/related-work>`
+- Chapter 3: :doc:`The Triton-C Language <programming-guide/chapter-3/triton-c>`
+- Chapter 4: :doc:`The Triton-IR Intermediate Representation <programming-guide/chapter-4/triton-ir>`

 .. toctree::
   :maxdepth: 1
@@ -50,3 +34,5 @@ Check out the following documents to learn more about Triton and how it compares

   programming-guide/chapter-1/introduction
   programming-guide/chapter-2/related-work
+   programming-guide/chapter-3/triton-c
+   programming-guide/chapter-4/triton-ir
--- a/docs/programming-guide/chapter-1/cuda-parallel-matmul.png
+++ b/docs/programming-guide/chapter-1/cuda-parallel-matmul.png
--- a/master/_sources/programming-guide/chapter-1/introduction.rst.txt
+++ b/master/_sources/programming-guide/chapter-1/introduction.rst.txt
--- a/docs/programming-guide/chapter-1/triton-parallel-matmul.png
+++ b/docs/programming-guide/chapter-1/triton-parallel-matmul.png
--- a/docs/programming-guide/chapter-2/halide-iteration.png
+++ b/docs/programming-guide/chapter-2/halide-iteration.png
--- a/docs/programming-guide/chapter-2/polyhedral-iteration.png
+++ b/docs/programming-guide/chapter-2/polyhedral-iteration.png
--- a/v1.1.2/_sources/programming-guide/chapter-2/related-work.rst.txt
+++ b/v1.1.2/_sources/programming-guide/chapter-2/related-work.rst.txt
@@ -2,7 +2,7 @@
 Related Work
 ==============

-At first sight, Triton may seem like just yet another DSL for DNNs. The purpose of this section is to contextualize Triton and highlight its differences with the two leading approaches in this domain: polyhedral compilation and scheduling languages.
+At first sight, Triton may seem like just yet another DSL for DNNs. The purpose of this section is to contextualize Triton and highlight its differences with the two leading approaches in this domain: polyhedral compilation and scheduling languages.

 -----------------------
 Polyhedral Compilation
@@ -121,7 +121,7 @@ Limitations

 Unfortunately, polyhedral compilers suffer from two major limitations that have prevented its adoption as a universal method for code generation in neural networks.

-First, the set of possible program transformations :math:`\Omega = \{ \Theta_S ~|~ S \in \text{program} \}` is large, and grows with the number of statements in the program as well as with the size of their iteration domain. Verifying the legality of each transformation can also require the resolution of complex integer linear programs, making polyhedral compilation very computationally expensive. To make matters worse, hardware properties (e.g., cache size, number of SMs) and contextual characteristics (e.g., input tensor shapes) also have to be taken into account by this framework, leading to expensive auto-tuning procedures [SATO2019]_.
+First, the set of possible program transformations :math:`\Omega = \{ \Theta_S ~|~ S \in \text{program} \}` is large, and grows with the number of statements in the program as well as with the size of their iteration domain. Verifying the legality of each transformation can also require the resolution of complex integer linear programs, making polyhedral compilation very computationally expensive. To make matters worse, hardware properties (e.g., cache size, number of SMs) and contextual characteristics (e.g., input tensor shapes) also have to be taken into account by this framework, leading to expensive auto-tuning procedures [SATO2019]_.

 Second, the polyhedral framework is not very generally applicable; SCoPs are relatively common [GIRBAL2006]_ but require loop bounds and array subscripts to be affine functions of loop indices, which typically only occurs in regular, dense computations. For this reason, this framework still has to be successfully applied to sparse -- or even structured-sparse -- neural networks, whose importance has been rapidly rising over the past few years.

@@ -131,7 +131,7 @@ On the other hand, blocked program representations advocated by this dissertatio
 Scheduling Languages
 -----------------------

-Separation of concerns [DIJKSTRA82]_ is a well-known design principle in computer science: programs should be decomposed into modular layers of abstraction that separate the semantics of their algorithms from the details of their implementation. Systems like Halide and TVM push this philosophy one step further, and enforce this separation at the grammatical level through the use of a  **scheduling language**. The benefits of this methodology are particularly visible in the case of matrix multiplication, where, as one can see below, the definition of the algorithm (Line 1-7) is completely disjoint from its implementation (Line 8-16), meaning that both can be maintained, optimized and distributed independently. 
+Separation of concerns \cite{dijkstra82} is a well-known design principle in computer science: programs should be decomposed into modular layers of abstraction that separate the semantics of their algorithms from the details of their implementation. Systems like Halide and TVM push this philosophy one step further, and enforce this separation at the grammatical level through the use of a  **scheduling language**. The benefits of this methodology are particularly visible in the case of matrix multiplication, where, as one can see below, the definition of the algorithm (Line 1-7) is completely disjoint from its implementation (Line 8-16), meaning that both can be maintained, optimized and distributed independently. 

 .. code-block:: python
  :linenos:
@@ -168,7 +168,7 @@ Scheduling languages are, without a doubt, one of the most popular approaches fo
 Limitations
 ++++++++++++

-This ease-of-development comes at a cost. First of all, existing systems that follow this paradigm tend to be noticeably slower than Triton on modern hardware when applicable (e.g., V100/A100 tensor cores w/ equal tile sizes). I do believe that this is not a fundamental issue of scheduling languages -- in the sense that it could probably be solved with more efforts -- but it could mean that these systems are harder to engineer. More importantly, existing scheduling languages generate loops whose bounds and increments cannot depend on surrounding loop indice without at least imposing severe constraints on possible schedules -- if not breaking the system entirely. This is problematic for sparse computations, whose iteration spaces may be irregular.
+This ease-of-development comes at a cost. First of all, existing systems that follow this paradigm tend to be noticeably slower than Triton on modern hardware when applicable (e.g., V100/A100 tensor cores w/ equal tile sizes). I do believe that this is not a fundamental issue of scheduling languages -- in the sense that it could probably be solved with more effort -- but it could mean that these systems are harder to engineer. More importantly, existing scheduling languages generate loops whose bounds and increments cannot depend on surrounding loop indices without at least imposing severe constraints on possible schedules -- if not breaking the system entirely. This is problematic for sparse computations, whose iteration spaces may be irregular.

 .. table::
    :widths: 50 50
@@ -206,5 +206,4 @@ References
 .. [GROSSER2012] T. Grosser et al., "Polly - Performing Polyhedral Optimizations on a Low-Level Intermediate Representation", Parallel Processing Letters 2012
 .. [SATO2019] Y. Sato et al., "An Autotuning Framework for Scalable Execution of Tiled Code via Iterative Polyhedral Compilation", TACO 2019
 .. [GIRBAL2006] S. Girbal et al., "Semi-Automatic Composition of Loop Transformations for Deep Parallelism and Memory Hierarchies", International Journal of Parallel Programming 2006
-.. [DIJKSTRA82] E. W. Dijkstra et al., "On the role of scientific thought", Selected writings on computing: a personal perspective 1982
-.. [MULLAPUDI2016] R. Mullapudi et al., "Automatically scheduling halide image processing pipelines", TOG 2016
+.. [MULLAPUDI2016] R. Mullapudi et al., "Automatically scheduling halide image processing pipelines", TOG 2016
--- a/docs/programming-guide/chapter-3/triton-c.rst
+++ b/docs/programming-guide/chapter-3/triton-c.rst
@@ -0,0 +1,84 @@
+=======================
+The Triton-C Language
+=======================
+
+In the introduction, we stressed the importance of blocked algorithms and described their core principles in pseudo-code. To facilitate their implementation on modern GPU hardware, we present Triton-C, a single-threaded imperative kernel language in which block variables are first-class citizens. This language may be used either directly by developers familiar with C, or as an intermediate language for existing (and future) transcompilers. In this chapter, we describe its differences with C, its Numpy-like semantics and its "Single-Program, Multiple-Data" (SPMD) programming model.
+
+-------------------
+Differences with C
+-------------------
+
+The syntax of Triton-C is based on that of ANSI C, but was modified and extended to accommodate the semantics and programming model described in the next two subsections. These changes fall into the following categories:
+
+++++++++++
+Extensions
+++++++++++
+
+**Variable declarations**: Triton adds special-purpose syntax for multi-dimensional array declarations (e.g., :code:`int block[16, 16]`), which purposely differs from that of nested arrays (i.e., arrays of pointers) found in ANSI C (e.g., :code:`int block[16][16]`). Block dimensions must be constant but can also be made parametric with the use of pre-processor macros. One-dimensional blocks of integers may be initialized using ellipses (e.g., :code:`int range[16] = 0 ... 16`).
+
+**Primitive types**: Triton-C supports the following primitive data-types: :code:`bool`, :code:`uint8`, :code:`uint16`, :code:`uint32`, :code:`uint64`, :code:`int8`, :code:`int16`, :code:`int32`, :code:`int64`, :code:`half`, :code:`float`, :code:`double`.
+
+**Operators and built-in functions**: The usual C operators were extended to support element-wise array operations (:code:`+`, :code:`-`, :code:`&&`, :code:`*`, etc.) and complex array operations (:code:`@` for matrix multiplication). Additionally, some built-in functions were added for concurrency (:code:`get_program_id`, :code:`atomic_add`).
+
+**Slicing and broadcasting**: Multi-dimensional blocks can be broadcast along any particular dimension using numpy-like slicing syntax (e.g., :code:`int array[8, 8] = range[:, newaxis]` for stacking columns). Note that, as of now, slicing blocks to retrieve sub-blocks (or scalars) is forbidden as it is incompatible with the automatic parallelization methods used by our JIT. Reductions can be achieved using a syntax similar to slicing (e.g., :code:`array[+]` for summing an array, or :code:`array[:, max]` for row-wise maximum). Currently supported reduction operators are :code:`+`, :code:`min`, :code:`max`.
+
+**Masked pointer dereferencement**: Block-level operations in Triton-C are "atomic", in the sense that they execute either completely or not at all. Basic element-wise control-flow for block-level operations can nonetheless be achieved using ternary operators and the *masked pointer dereferencement* operator exemplified below:
+
+.. code-block:: C
+  :force:
+
+  // create mask
+  bool mask[16, 16] = ...;
+  // conditional addition
+  float x[16, 16] = mask ? a + b : 0;
+  // conditional load
+  float y[16, 16] = mask ? *ptr : 0;
+  // conditional store
+  *?(mask)ptr = y;
+
+
+
+++++++++++++
+Restrictions
+++++++++++++
+
+The Triton project is still in its infancy. As such, there are quite a few features of ANSI C that are not supported:
+
+**Non-kernel functions**: Right now, all function definitions must be kernels, i.e. be preceded with the :code:`__global__` attribute. We are aware that this is a severe limitation; it exists because our automatic parallelization engine would not be capable of handling array parameter arguments.
+
+**Non-primitive types**: Non-primitive types defined with :code:`struct` and :code:`union` are currently not supported, again because it is unclear at this point how these constructs would hook into our block-level data-flow analysis passes.
+
+**While loops**: We just haven't had time to implement those yet.
+
+----------------
+Semantics
+----------------
+
+The existence of built-in **blocked** types, variables and operations in Triton-C offers two main benefits. First, it simplifies the structure of blocked programs by hiding important details pertaining to concurrent programming such as memory coalescing, cache management and specialized tensor intrinsics. Second, it opens the door for compilers to perform these optimizations automatically. However, it also means that programs have some kind of *block-level semantics* that does not exist in C. Though some aspects of it (e.g., the :code:`@` operator) are pretty intuitive, one in particular might be puzzling to some GPU programmers: broadcasting semantics.
+
+++++++++++++++++++++++
+Broadcasting Semantics
+++++++++++++++++++++++
+
+
+Block variables in Triton are strongly typed, meaning that certain instructions statically require their operands to satisfy strict shape constraints. For example, a scalar may not be added to an array unless it is first appropriately broadcast. *Broadcasting semantics* (first introduced in `Numpy <https://numpy.org/doc/stable/user/basics.broadcasting.html>`_) provides two formal rules for performing these conversions automatically in the case of binary operators: (1) the shape of the lowest-dimension operand is left-padded with ones until both operands have the same dimensionality; and (2) the content of both operands is replicated as many times as needed until their shape is identical. An error is emitted if this cannot be done.
+
+.. code-block:: C
+
+  int a[16], b[32, 16], c[16, 1];
+  // a is first reshaped to [1, 16]
+  // and then broadcast to [32, 16]
+  int x_1[32, 16] = a[newaxis, :] + b;
+  // Same as above but implicitly
+  int x_2[32, 16] = a + b;
+  // a is first reshaped to [1, 16]
+  // a is broadcast to [16, 16]
+  // c is broadcast to [16, 16]
+  int y[16, 16] = a + c;
+
+------------------
+Programming Model
+------------------
+
+As discussed in the `CUDA documentation <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html>`_, the execution of CUDA code on GPUs is supported by an `SPMD <https://en.wikipedia.org/wiki/SPMD>`_ programming model in which each kernel instance is associated with an identifiable *thread-block*, itself decomposed into *warps* of 32 *threads*. The Triton programming model is similar, but each kernel is *single-threaded* -- though automatically parallelized -- and associated with a global :code:`program id` which varies from instance to instance. This approach leads to simpler kernels in which CUDA-like concurrency primitives (shared memory synchronization, inter-thread communication, etc.) do not exist. The global program ids associated with each kernel instance can be queried using the :code:`get_program_id(axis)` built-in function where :code:`0 <= axis <= 2`. This is useful, for example, to create blocks of pointers as shown in the tutorials.
+
--- a/docs/programming-guide/chapter-4/broadcast-1.png
+++ b/docs/programming-guide/chapter-4/broadcast-1.png
--- a/docs/programming-guide/chapter-4/broadcast-2.png
+++ b/docs/programming-guide/chapter-4/broadcast-2.png
--- a/docs/programming-guide/chapter-4/triton-ir.rst
+++ b/docs/programming-guide/chapter-4/triton-ir.rst
@@ -0,0 +1,82 @@
+==========================================
+The Triton-IR Intermediate Representation
+==========================================
+
+Triton-IR is an LLVM-based Intermediate Representation (IR) whose purpose is to provide an environment suitable for block-level program analysis, transformation and optimization.
+In our implementation, Triton-IR programs are constructed directly from Triton-C after parsing, but they could also be formed directly by higher-level DSLs in the future.
+Triton-IR and LLVM-IR programs share the same high-level structure, but the former also includes a number of extensions necessary for block-level data-flow analysis.
+These extensions are crucial for carrying out the optimizations outlined in the next chapter of this document.
+
+---------------------------------
+Structure of a Triton-IR Program
+---------------------------------
+
++++++++
+Modules
++++++++
+
+At the highest level, Triton-IR programs consist of one or multiple basic units of compilation known as *modules*. These modules are compiled independently from one another, and eventually aggregated by a linker whose role is to resolve forward declarations and adequately merge global definitions. Each module itself is composed of functions, global variables, constants and other miscellaneous symbols such as metadata and attributes.
+
++++++++++
+Functions
++++++++++
+
+Triton-IR function definitions consist of a return type, a name and a potentially empty arguments list. Additional visibility, alignment and linkage specifiers can be added if desired. Function attributes (such as inlining hints) and parameter attributes (such as "readonly", aliasing hints) can also be specified, allowing compiler backends to perform more aggressive optimizations by, for instance, making better use of non-coherent caches found on NVIDIA GPUs. This header is followed by a body composed of a list of basic blocks whose interdependencies form the Control Flow Graph (CFG) of the function.
+
+++++++++++++
+Basic Blocks
+++++++++++++
+
+Basic blocks are straight-line code sequences that may only contain so-called *terminator* instructions (i.e., branching, return) at their end. To simplify program analysis, Triton-IR uses the Static Single Assignment (SSA) form, meaning that each variable in each basic block must be (1) assigned to only once and (2) defined before being used. In so doing, each basic block implicitly defines a Data-Flow Graph (DFG). In our case, the SSA form is created directly from Triton-C's Abstract Syntax Trees (ASTs) using an algorithm from the literature [BRAUN13]_.
+
+---------------------------------
+Block-Level Dataflow Analysis
+---------------------------------
+
+++++++
+Types
+++++++
+
+Multi-dimensional blocks are at the center of data-flow analysis in Triton-JIT. They can be declared using syntax similar to vector declarations in LLVM-IR. For example, :code:`i32<8, 8>` is the type corresponding to :math:`8 \times 8` blocks of 32-bit integers. Note that there is no preprocessor in Triton-IR, hence parametric shape  values must be resolved before programs are generated. In our case, this is done by Triton-JIT's auto-tuner.
+
+++++++++++++
+Instructions
+++++++++++++
+
+Triton-IR introduces a set of *reblocking* instructions whose purpose is to support broadcasting semantics as described in the previous chapter. The :code:`reshape` instruction creates a block of the specified shape using the raw data from its input argument. This is particularly useful to re-interpret variables as higher-dimensional arrays by padding their input shapes with ones in preparation for broadcasting. The :code:`broadcast` instruction creates a block of the specified shape by replicating its input argument as many times as necessary along dimensions of size 1 -- as shown below for the :code:`broadcast<3,3>` instruction.
+
+|pic1| and |pic2|
+
+.. |pic1| image:: broadcast-1.png
+   :width: 40%
+
+.. |pic2| image:: broadcast-2.png
+   :width: 40%
+
+Usual scalar instructions (:code:`cmp`, :code:`getelementptr`, :code:`add`, :code:`load`...) were preserved and extended to signify element-wise operations when applicable. Finally, Triton-IR also exposes specialized arithmetic instructions for reductions (:code:`reduce`) and matrix multiplications (:code:`dot`).
+
+----------------------------------
+Block-Level Control Flow Analysis
+----------------------------------
+
+In Triton-IR, operations on block variables are atomic: they execute either in full or not at all. As a result, traditional control flow structures (e.g., conditionals, loops) are not applicable to individual block elements. This is problematic, since a program may need to, e.g., partially guard blocked loads against memory access violations.
+
+This could be potentially solved through the use of the Predicated SSA (PSSA) [CARTER99]_ [STOUTCHININ01]_ form for Triton-IR. However, this would create a lot of unnecessary complexity for GPUs, where the benefits of PSSA are close to none as divergent program paths  within warps are  serialized anyway. Therefore, recent versions of Triton handle intra-block control flow in a much simpler way, using conditional instructions such as  :code:`select`, :code:`masked_load` and :code:`masked_store`:
+
+.. code-block:: C
+
+  // For all indices [idx], return cond[idx] ? true_value[idx] : false_value[idx];
+  select       TYPE<TS1, ..., TSN> cond, true_value, false_value;
+  // For all indices [idx], return cond[idx] ? *true_addr[idx] : false_value[idx];
+  masked_load  TYPE<TS1, ..., TSN> cond, true_addr, false_value;
+  // For all indices [idx], execute *true_addr[idx] = true_value[idx] if cond[idx]
+  masked_store TYPE<TS1, ..., TSN> cond, true_addr, true_value;
+
+
+------------
+References
+------------
+
+.. [BRAUN13] M. Braun et al., "Simple and Efficient Construction of Static Single Assignment Form", CC 2013
+.. [CARTER99] L. Carter et al., "Predicated Static Single Assignment", PACT 1999
+.. [STOUTCHININ01] A. Stoutchinin et al., "Efficient Static Single Assignment Form for Predication", MICRO 2001
--- a/include/triton/codegen/analysis/align.h
+++ b/include/triton/codegen/analysis/align.h
@@ -0,0 +1,80 @@
+#ifndef TDL_INCLUDE_CODEGEN_ALIGNMENT_INFO_PASS_H
+#define TDL_INCLUDE_CODEGEN_ALIGNMENT_INFO_PASS_H
+
+#include <map>
+#include <vector>
+
+namespace triton {
+
+namespace ir {
+  class value;
+  class module;
+  class phi_node;
+  class splat_inst;
+  class cast_inst;
+  class reshape_inst;
+  class broadcast_inst;
+  class binary_operator;
+  class getelementptr_inst;
+}
+
+namespace codegen{
+namespace analysis{
+
+// Analysis pass that keeps, for each IR value, three vectors stored in the
+// maps at the bottom of the class: is_constant_, max_contiguous_ and
+// starting_multiple_.
+// NOTE(review): the exact meaning of these quantities is defined by the
+// populate_* implementations, which are not part of this header -- confirm there.
+class align {
+private:
+  // Element type of the is_constant_ vectors.
+  // NOTE(review): presumably `num_cst` is a count of constant elements and
+  // `value` the constant itself -- confirm against the implementation.
+  struct cst_info {
+    unsigned num_cst;
+    unsigned value;
+  };
+  // helpers
+  std::vector<unsigned> get_shapes(ir::value *v);
+  // populate is_constant
+  // One helper per IR node kind (phi, splat, reshape, broadcast, binop, gep)
+  // plus a default; there is no cast variant for this property.
+  std::vector<cst_info> populate_is_constant_phi(ir::phi_node* x);
+  std::vector<cst_info> populate_is_constant_splat(ir::splat_inst* x);
+  std::vector<cst_info> populate_is_constant_reshape(ir::reshape_inst* x);
+  std::vector<cst_info> populate_is_constant_broadcast(ir::broadcast_inst* x);
+  std::vector<cst_info> populate_is_constant_binop(ir::binary_operator* x);
+  std::vector<cst_info> populate_is_constant_gep(ir::getelementptr_inst* x);
+  std::vector<cst_info> populate_is_constant_default(ir::value* v);
+  // Entry point taking any ir::value (presumably dispatches to the helpers above -- confirm).
+  std::vector<cst_info> populate_is_constant(ir::value *v);
+  // populate max_contiguous
+  // Same layout as above, with an additional cast variant.
+  std::vector<unsigned> populate_max_contiguous_phi(ir::phi_node* x);
+  std::vector<unsigned> populate_max_contiguous_splat(ir::splat_inst* x);
+  std::vector<unsigned> populate_max_contiguous_reshape(ir::reshape_inst* x);
+  std::vector<unsigned> populate_max_contiguous_broadcast(ir::broadcast_inst* x);
+  std::vector<unsigned> populate_max_contiguous_binop(ir::binary_operator* x);
+  std::vector<unsigned> populate_max_contiguous_gep(ir::getelementptr_inst* x);
+  std::vector<unsigned> populate_max_contiguous_cast(ir::cast_inst* x);
+  std::vector<unsigned> populate_max_contiguous_default(ir::value* v);
+  std::vector<unsigned> populate_max_contiguous(ir::value *v);
+  // populate starting_multiple
+  // Same layout as max_contiguous.
+  std::vector<unsigned> populate_starting_multiple_phi(ir::phi_node* x);
+  std::vector<unsigned> populate_starting_multiple_splat(ir::splat_inst* x);
+  std::vector<unsigned> populate_starting_multiple_reshape(ir::reshape_inst* x);
+  std::vector<unsigned> populate_starting_multiple_broadcast(ir::broadcast_inst* x);
+  std::vector<unsigned> populate_starting_multiple_binop(ir::binary_operator* x);
+  std::vector<unsigned> populate_starting_multiple_gep(ir::getelementptr_inst* x);
+  std::vector<unsigned> populate_starting_multiple_cast(ir::cast_inst* x);
+  std::vector<unsigned> populate_starting_multiple_default(ir::value* v);
+  std::vector<unsigned> populate_starting_multiple(ir::value *v);
+  // populate all maps
+  void populate(ir::value *v);
+
+public:
+  // Runs the analysis on `mod`.
+  void run(ir::module &mod);
+  // Query for value `v` along index `ax`.
+  // NOTE(review): which of the three maps backs this accessor is not visible here -- confirm.
+  unsigned get(ir::value* v, unsigned ax) const;
+  // NOTE(review): presumably returns the max_contiguous_ entry for `v` -- confirm.
+  std::vector<unsigned> contiguous(ir::value* v) const;
+
+private:
+  // Results, keyed by IR value.
+  std::map<ir::value*, std::vector<cst_info>> is_constant_;
+  std::map<ir::value*, std::vector<unsigned>> max_contiguous_;
+  std::map<ir::value*, std::vector<unsigned>> starting_multiple_;
+};
+
+
+}
+}
+}
+
+#endif
--- a/include/triton/codegen/analysis/allocation.h
+++ b/include/triton/codegen/analysis/allocation.h
@@ -0,0 +1,47 @@
+#ifndef TDL_INCLUDE_IR_CODEGEN_STORAGE_ALLOC_H
+#define TDL_INCLUDE_IR_CODEGEN_STORAGE_ALLOC_H
+
+#include <map>
+#include <set>
+#include <iostream>
+#include "triton/codegen/analysis/liveness.h"
+
+namespace triton{
+
+namespace ir{
+  class value;
+  class function;
+  class module;
+}
+
+namespace codegen{
+namespace analysis{
+
+class tiles;
+
+class liveness;
+class cts;
+
+// Result holder for the allocation pass: one offset per data layout plus the
+// total allocated size.
+// NOTE(review): offsets and sizes are presumably expressed in bytes of shared
+// memory -- confirm in the implementation of run().
+class allocation {
+public:
+  // `live` is stored as a raw pointer; this class declares no destructor and
+  // therefore never frees it.
+  // allocated_size_ is zero-initialised so that allocated_size() returns a
+  // well-defined value even when it is queried before run().
+  allocation(liveness *live)
+    : allocated_size_(0), liveness_(live) { }
+  // accessors
+  // true if an offset has been recorded for layout `x`
+  bool has_offset(const data_layout *x)    const { return offsets_.find(x) != offsets_.end(); }
+  // offset recorded for `x`; std::map::at throws std::out_of_range when there is none
+  unsigned offset(const data_layout *x)    const { return offsets_.at(x); }
+  // total size; note the implicit narrowing from size_t to unsigned
+  unsigned allocated_size()        const { return allocated_size_; }
+  // run
+  void run(ir::module& mod);
+
+private:
+  std::map<const data_layout*, unsigned> offsets_;
+  size_t allocated_size_;
+  // dependences
+  liveness *liveness_;
+};
+
+}
+}
+}
+
+#endif
--- a/include/triton/codegen/analysis/axes.h
+++ b/include/triton/codegen/analysis/axes.h
@@ -0,0 +1,51 @@
+#ifndef _TRITON_CODEGEN_ANALYSIS_AXES_H_
+#define _TRITON_CODEGEN_ANALYSIS_AXES_H_
+
+#include "triton/tools/graph.h"
+#include <map>
+#include <vector>
+
+namespace triton{
+
+namespace ir{
+  class value;
+  class module;
+  class instruction;
+}
+
+namespace codegen{
+namespace analysis{
+
+// Analysis pass that builds a graph whose nodes are (value, dimension) pairs
+// and stores one size_t per node in axes_.
+// NOTE(review): presumably nodes connected in the graph end up with the same
+// axis id -- confirm in the implementation of run().
+class axes {
+  // graph node: an IR value together with one of its dimensions
+  typedef std::pair<ir::value*, unsigned> node_t;
+
+private:
+  // update graph
+  // One helper per instruction category; update_graph(i) is the generic entry
+  // point (presumably dispatching to the helpers above it -- confirm).
+  void update_graph_store(ir::instruction *i);
+  void update_graph_reduce(ir::instruction *i);
+  void update_graph_reshape(ir::instruction *i);
+  void update_graph_trans(ir::instruction *i);
+  void update_graph_broadcast(ir::instruction *i);
+  void update_graph_dot(ir::instruction *i);
+  void update_graph_elementwise(ir::instruction *i, bool connect_ret=true);
+  void update_graph_no_edge(ir::instruction *i);
+  void update_graph(ir::instruction *i);
+
+public:
+  axes();
+  // Runs the analysis on `mod`.
+  void run(ir::module &mod);
+  // accessors
+  // axis id of dimension `dim` of `value`
+  int get(ir::value *value, unsigned dim);
+  // axis ids of `value` (presumably one per dimension -- confirm)
+  std::vector<int> get(ir::value *value);
+
+private:
+  tools::graph<node_t> graph_;
+  std::map<node_t, size_t> axes_;
+};
+
+}
+}
+
+}
+
+#endif
--- a/include/triton/codegen/analysis/layout.h
+++ b/include/triton/codegen/analysis/layout.h
@@ -0,0 +1,228 @@
+#ifndef _TRITON_CODEGEN_ANALYSIS_GRID_H_
+#define _TRITON_CODEGEN_ANALYSIS_GRID_H_
+
+#include <map>
+#include <set>
+#include <vector>
+#include <memory>
+#include "triton/tools/graph.h"
+#include "triton/codegen/target.h"
+
+namespace triton{
+
+namespace ir{
+  class value;
+  class type;
+  class module;
+  class instruction;
+  class phi_node;
+}
+
+namespace codegen{
+namespace analysis{
+
+class axes;
+class align;
+class layout_visitor;
+class data_layout;
+class mma_layout;
+class scanline_layout;
+class shared_layout;
+
+
+// Visitor interface over the concrete layout kinds. Implementers must provide
+// the three visit_layout_* overloads; visit_layout(data_layout*) is the
+// non-pure entry point defined out of line.
+class layout_visitor {
+public:
+  // Polymorphic base class: a virtual destructor keeps destruction through a
+  // layout_visitor* well-defined.
+  virtual ~layout_visitor() = default;
+  virtual void visit_layout(data_layout *);
+  virtual void visit_layout_mma(mma_layout*) = 0;
+  virtual void visit_layout_scanline(scanline_layout*) = 0;
+  virtual void visit_layout_shared(shared_layout*) = 0;
+};
+
+// Abstract base class of the MMA, scanline and shared layouts.
+// Stores the axes, shape, order and IR values of a layout, together with an
+// id tag that the to_*() helpers use to downcast via static_cast.
+class data_layout {
+protected:
+  // tag identifying the concrete layout class
+  enum id_t {
+    MMA,
+    SCANLINE,
+    SHARED
+  };
+
+  typedef std::vector<int> axes_t;
+  typedef std::vector<unsigned> shape_t;
+  typedef std::vector<int> order_t;
+  typedef std::vector<ir::value*> values_t;
+
+private:
+  // Returns `this` as T* when the stored tag equals `id`, nullptr otherwise.
+  template<typename T>
+  T* downcast(id_t id) {
+    if(id_ == id)
+      return static_cast<T*>(this);
+    return nullptr;
+  }
+
+public:
+  data_layout(id_t id,
+             const std::vector<int>& axes,
+             const std::vector<unsigned> &shape,
+             const std::vector<ir::value *> &values,
+             analysis::align* align);
+  // Concrete layouts are handled through data_layout* (see layouts::get()),
+  // so the destructor must be virtual for destruction through a base pointer
+  // to be well-defined.
+  virtual ~data_layout() = default;
+  // visitor
+  virtual void accept(layout_visitor* vst) = 0;
+  // downcast (each returns nullptr when the layout is of another kind)
+  mma_layout* to_mma()          { return downcast<mma_layout>(MMA); }
+  scanline_layout* to_scanline()      { return downcast<scanline_layout>(SCANLINE); }
+  shared_layout* to_shared()          { return downcast<shared_layout>(SHARED); }
+  // accessors
+  // number of dimensions; const-qualified so it is usable on const layouts
+  size_t get_rank() const             { return shape_.size(); }
+  const shape_t& get_shape() const    { return shape_; }
+  const order_t& get_order() const    { return order_; }
+  const values_t& get_values() const  { return values_;}
+  // bounds-checked: std::vector::at throws std::out_of_range
+  int get_axis(size_t k) const        { return axes_.at(k); }
+  std::vector<int> get_axes() const   { return axes_; }
+  // returned by value, so no const qualifier on the return type
+  int get_order(size_t k) const       { return order_.at(k); }
+  // find the position of given axis
+  int find_axis(int to_find) const;
+
+
+private:
+  id_t id_;
+  axes_t axes_;
+  values_t values_;
+
+protected:
+  order_t order_;
+  shape_t shape_;
+};
+
+// Layout visited through layout_visitor::visit_layout_mma.
+// Holds five per-dimension integer vectors (fpw, spw, wpt, spt, rep).
+// NOTE(review): these abbreviations are not expanded anywhere in this header;
+// their meaning is fixed by the constructor implementation -- confirm there.
+class mma_layout: public data_layout {
+public:
+  mma_layout(size_t num_warps,
+                const std::vector<int>& axes,
+                const std::vector<unsigned>& shapes,
+                const std::vector<ir::value *> &values,
+                analysis::align* align, target *tgt,
+             shared_layout* layout_a,
+             shared_layout* layout_b);
+  // double dispatch into the visitor
+  void accept(layout_visitor* vst) { vst->visit_layout_mma(this); }
+  // accessor
+  // All accessors are bounds-checked: std::vector::at throws std::out_of_range.
+  int fpw(size_t k) { return fpw_.at(k); }
+  int wpt(size_t k) { return wpt_.at(k); }
+  int spw(size_t k) { return spw_.at(k); }
+  int spt(size_t k) { return spt_.at(k); }
+  int rep(size_t k) { return rep_.at(k); }
+
+private:
+  std::vector<int> fpw_;
+  std::vector<int> spw_;
+  std::vector<int> wpt_;
+  std::vector<int> spt_;
+  std::vector<int> rep_;
+};
+
+// Layout visited through layout_visitor::visit_layout_scanline.
+// Declared with the `class` key to match its forward declaration earlier in
+// this header and its sibling layouts; all members stay public, exactly as
+// they were with the `struct` key.
+// NOTE(review): the meaning of `mts` / `nts` is set by the constructor
+// implementation, which is not visible in this header -- confirm there.
+class scanline_layout: public data_layout {
+public:
+  scanline_layout(size_t num_warps,
+                    const std::vector<int>& axes,
+                    const std::vector<unsigned>& shape,
+                    const std::vector<ir::value *> &values,
+                    analysis::align* align,
+                    target* tgt);
+  // double dispatch into the visitor
+  void accept(layout_visitor* vst) { vst->visit_layout_scanline(this); }
+  // accessor (bounds-checked: std::vector::at throws std::out_of_range)
+  int mts(size_t k) { return mts_.at(k); }
+  int nts(size_t k) { return nts_.at(k); }
+
+public:
+  std::vector<int> mts_;
+  std::vector<int> nts_;
+};
+
+// Plain record describing a double-buffered value: a phi node and two IR values.
+// NOTE(review): presumably `first` is the value entering the loop and `latch`
+// the value coming from the loop latch of `phi` -- confirm in
+// shared_layout::extract_double_bufferable.
+struct double_buffer_info_t {
+  ir::value* first;
+  ir::value* latch;
+  ir::phi_node* phi;
+};
+
+// Layout visited through layout_visitor::visit_layout_shared.
+// In addition to the data_layout state it stores a size, an element type,
+// optional double-buffering information, the layout it was built from
+// (arg_layout_) and a few MMA-related fields.
+class shared_layout: public data_layout {
+private:
+  // Static helpers used for double-buffering detection; defined out of line.
+  static bool is_loop_latch(ir::phi_node *phi, ir::instruction *terminator);
+  static void extract_double_bufferable(ir::value *v, std::shared_ptr<double_buffer_info_t>& res);
+
+public:
+  shared_layout(data_layout *arg,
+                const std::vector<int>& axes,
+                const std::vector<unsigned>& shapes,
+                const std::vector<ir::value *> &values_,
+                ir::type *ty,
+                analysis::align* align);
+  // double dispatch into the visitor
+  void accept(layout_visitor* vst) { vst->visit_layout_shared(this); }
+  // accessors
+  // NOTE(review): unit of size_ (bytes vs. elements) is not visible here -- confirm.
+  size_t get_size()                         { return size_; }
+  ir::type* get_type()                      { return ty_; }
+  // non-owning view of double_buffer_; nullptr when the shared_ptr is empty
+  double_buffer_info_t* get_double_buffer() { return double_buffer_.get(); }
+  size_t get_num_per_phase()                { return num_per_phase_; }
+  ir::value* hmma_dot_a()                      { return hmma_dot_a_; }
+  ir::value* hmma_dot_b()                      { return hmma_dot_b_; }
+  // mma_vec_ is the only field with a setter in this header
+  void set_mma_vec(int mma_vec)             { mma_vec_ = mma_vec; }
+  int  get_mma_vec()                        { return mma_vec_;}
+  data_layout* get_arg_layout()             { return arg_layout_; }
+
+private:
+  size_t size_;
+  ir::type *ty_;
+  std::shared_ptr<double_buffer_info_t> double_buffer_;
+  size_t num_per_phase_;
+  ir::value* hmma_dot_a_;
+  ir::value* hmma_dot_b_;
+  data_layout* arg_layout_;
+  int mma_vec_;
+};
+
+
+
+// Analysis pass that assigns every IR value to a group id and keeps one
+// data_layout per group id.
+class layouts {
+  typedef ir::value* node_t;
+  // NOTE(review): graph_t is not referenced anywhere else in this class
+  // declaration (graph_ below is a tools::graph) -- possibly dead.
+  typedef std::map <node_t, std::set<node_t>> graph_t;
+
+private:
+  // graph creation
+  void connect(ir::value *x, ir::value *y);
+  void make_graph(ir::instruction *i);
+
+  void init_hmma_tile(data_layout& layouts);
+  void init_scanline_tile(data_layout &layouts);
+
+  // builds the layout with the given id for the given values (defined out of line)
+  void create(size_t id, const std::vector<ir::value*>& values);
+
+public:
+  // constructor
+  layouts(analysis::axes *axes, analysis::align *align, size_t num_warps, target* tgt);
+
+  // accessors
+  // All lookups below use std::map::at and throw std::out_of_range on a missing key.
+  // group id of `value`
+  unsigned layout_of(ir::value *value) const                  { return groups_.at(value); }
+  // values belonging to group `id`
+  const std::vector<ir::value*>& values_of(unsigned id) const { return values_.at(id); }
+  size_t num_layouts() const                                  { return values_.size();}
+  data_layout* get(size_t id)                                 { return layouts_.at(id); }
+  data_layout* get(ir::value *v)                              { return get(layout_of(v));}
+  std::map<size_t, data_layout*> &get_all()                   { return layouts_; }
+  // id stored in tmp_ for instruction `i`
+  // NOTE(review): presumably the id of a temporary layout -- confirm in run().
+  size_t tmp(ir::instruction* i)                              { return tmp_.at((ir::value*)i);}
+
+  // execution
+  void run(ir::module &mod);
+
+private:
+  analysis::axes* axes_;
+  analysis::align* align_;
+  size_t num_warps_;
+  target* tgt_;
+  tools::graph<ir::value*> graph_;
+  std::map<ir::value*, size_t> groups_;
+  std::map<size_t, std::vector<ir::value*>> values_;
+  // NOTE(review): raw data_layout pointers; who deletes them is not visible in this header.
+  std::map<size_t, data_layout*> layouts_;
+  std::map<ir::value*, size_t> tmp_;
+};
+
+}
+}
+
+}
+
+#endif
--- a/include/triton/codegen/analysis/liveness.h
+++ b/include/triton/codegen/analysis/liveness.h
@@ -0,0 +1,67 @@
+#ifndef TDL_INCLUDE_IR_CODEGEN_LIVENESS_H
+#define TDL_INCLUDE_IR_CODEGEN_LIVENESS_H
+
+#include <map>
+#include <set>
+#include <vector>
+#include "triton/codegen/analysis/layout.h"
+#include "triton/tools/graph.h"
+
+namespace triton{
+
+namespace ir{
+  class value;
+  class phi_node;
+  class function;
+  class module;
+  class instruction;
+}
+
+namespace codegen{
+namespace analysis{
+
+typedef unsigned slot_index;
+
+class tiles;
+class layouts;
+class data_layout;
+
+// Half-open interval [start, end) of slot indices.
+struct segment {
+  slot_index start;
+  slot_index end;
+
+  // true when start <= idx < end
+  bool contains(slot_index idx) const {
+    return start <= idx && idx < end;
+  }
+
+  // true when either segment contains the other's start.
+  // const-qualified: it only reads state, so it can be called on const
+  // segments (e.g. those reached through a const intervals map).
+  bool intersect(const segment &Other) const {
+    return contains(Other.start) || Other.contains(start);
+  }
+};
+
+
+// Analysis pass that associates one segment with each shared layout.
+// NOTE(review): presumably the segment is the live range of the layout's
+// buffer -- confirm in the implementation of run().
+class liveness {
+private:
+  typedef std::map<shared_layout*, segment>    intervals_map_t;
+
+public:
+  // constructor
+  // `l` is stored as a raw pointer; this class declares no destructor and never frees it.
+  liveness(layouts *l): layouts_(l){ }
+  // accessors
+  // whole map, by const reference
+  const intervals_map_t& get()  const { return intervals_; }
+  // segment of `v` by value; std::map::at throws std::out_of_range if `v` is absent
+  segment get(shared_layout* v)  const { return intervals_.at(v); }
+  // run
+  void run(ir::module &mod);
+
+private:
+  // analysis
+  layouts *layouts_;
+  intervals_map_t intervals_;
+};
+
+}
+}
+}
+
+
+#endif
--- a/include/triton/codegen/analysis/swizzle.h
+++ b/include/triton/codegen/analysis/swizzle.h
@@ -0,0 +1,43 @@
+#ifndef TRITON_INCLUDE_IR_CODEGEN_SWIZZLE_H
+#define TRITON_INCLUDE_IR_CODEGEN_SWIZZLE_H
+
+#include <map>
+
+namespace triton{
+
+namespace ir{
+  class module;
+}
+
+namespace codegen{
+class target;
+
+namespace analysis{
+
+class layouts;
+class data_layout;
+
+// Analysis pass that stores three integers per data layout: per_phase,
+// max_phase and vec.
+// NOTE(review): presumably shared-memory swizzling parameters -- confirm in
+// the implementation of run().
+class swizzle {
+public:
+  // constructor
+  // Both pointers are stored as-is; this class declares no destructor and never frees them.
+  swizzle(layouts *l, target* tgt): layouts_(l), tgt_(tgt){ }
+  // accessors
+  // Each lookup uses std::map::at and throws std::out_of_range for an unknown layout.
+  int get_per_phase(data_layout* layout) { return per_phase_.at(layout); }
+  int get_max_phase(data_layout* layout) { return max_phase_.at(layout); }
+  int get_vec  (data_layout* layout)     { return vec_.at(layout); }
+  // run
+  void run(ir::module &mod);
+private:
+  layouts* layouts_;
+  target* tgt_;
+  std::map<data_layout*, int> per_phase_;
+  std::map<data_layout*, int> max_phase_;
+  std::map<data_layout*, int> vec_;
+};
+
+}
+}
+}
+
+
+#endif
--- a/include/triton/codegen/pass.h
+++ b/include/triton/codegen/pass.h
@@ -0,0 +1,30 @@
+#ifndef _TRITON_CODEGEN_PASS_H_
+#define _TRITON_CODEGEN_PASS_H_
+
+#include <list>
+
+namespace triton{
+
+namespace ir{
+  class module;
+}
+
+namespace codegen{
+
+// Interface with a single overridable entry point, run(ir::module&).
+class pass {
+public:
+  // Polymorphic base class stored as pass* by pass_manager: a virtual
+  // destructor keeps destruction through a base pointer well-defined.
+  virtual ~pass() = default;
+  virtual void run(ir::module& m);
+};
+
+
+// Holds a list of pass pointers; add() and run() are defined out of line.
+// NOTE(review): presumably run() invokes every registered pass in insertion
+// order and the manager does not own the passes -- confirm in the implementation.
+class pass_manager {
+public:
+  void add(pass* p);
+  void run(ir::module& m);
+
+private:
+  std::list<pass*> passes;
+};
+
+}
+}
+
+// Closes the include guard opened at the top of the file; without it every
+// inclusion fails with an "unterminated conditional directive" error.
+#endif
--- a/include/triton/codegen/selection/generator.h
+++ b/include/triton/codegen/selection/generator.h
@@ -0,0 +1,199 @@
+#pragma once
+
+#ifndef _TRITON_SELECTION_GENERATOR_H_
+#define _TRITON_SELECTION_GENERATOR_H_
+
+#include "triton/ir/visitor.h"
+#include "triton/codegen/analysis/layout.h"
+#include <functional>
+
+// forward
+namespace llvm{
+  class Type;
+  class Value;
+  class BasicBlock;
+  class Attribute;
+  class Instruction;
+  class Constant;
+  class LLVMContext;
+  class Module;
+  class ConstantFolder;
+  class IRBuilderDefaultInserter;
+  template <typename T, typename Inserter>
+  class IRBuilder;
+  class ArrayType;
+  class Function;
+}
+
+// Forward declarations and short aliases used by the generator declared
+// further down in this header.
+namespace triton{
+
+// IR types that are only needed by name in this header
+namespace ir{
+class attribute;
+class load_inst;
+class store_inst;
+}
+
+namespace codegen{
+
+// forward
+namespace analysis{
+class liveness;
+class tiles;
+class align;
+class allocation;
+class cts;
+class axes;
+class layouts;
+class swizzle;
+}
+// typedef
+// Unqualified aliases for the LLVM types forward-declared above, so the rest
+// of triton::codegen can write e.g. `Value*` instead of `llvm::Value*`.
+typedef llvm::IRBuilder<llvm::ConstantFolder,
+                        llvm::IRBuilderDefaultInserter> Builder;
+typedef llvm::LLVMContext LLVMContext;
+typedef llvm::Type Type;
+typedef llvm::Value Value;
+typedef llvm::Attribute Attribute;
+typedef llvm::BasicBlock BasicBlock;
+typedef llvm::Module Module;
+typedef llvm::Instruction Instruction;
+typedef llvm::Constant Constant;
+typedef llvm::ArrayType ArrayType;
+typedef llvm::Function Function;
+// a list of LLVM values used as an index tuple
+typedef std::vector<Value*> indices_t;
+class target;
+
+}
+}
+
+namespace triton{
+namespace codegen{
+
+// Plain aggregate describing one axis as stored in generator::axes_.
+// NOTE(review): field meanings below are inferred from names only -- confirm
+// against the generator implementation.
+struct distributed_axis {
+  int contiguous;              // presumably the contiguous-element count along the axis
+  std::vector<Value*> values;  // LLVM values attached to the axis
+  Value* thread_id;            // LLVM value, presumably the thread index along the axis
+};
+
+// Visitor over Triton IR values (ir::visitor) and over data layouts
+// (analysis::layout_visitor). It holds pointers to an LLVM context, builder
+// and module, plus maps from IR values to LLVM values, so it presumably emits
+// LLVM IR for a Triton module -- the implementation is not in this header.
+class generator: public ir::visitor, public analysis::layout_visitor {
+private:
+  // internal helpers (defined elsewhere)
+  void init_idx(ir::value *x);
+  Instruction* add_barrier();
+  Value* shared_off(const std::vector<unsigned>& shapes, const std::vector<int>& order, indices_t idx);
+  void finalize_shared_layout(analysis::shared_layout*);
+  void finalize_function(ir::function*);
+  void finalize_phi_node(ir::phi_node*);
+
+private:
+  // conversions from Triton IR entities to their LLVM counterparts
+  Type *cvt(ir::type *ty);
+  llvm::Attribute cvt(ir::attribute attr);
+
+public:
+  // All analysis objects and the target are passed as raw pointers;
+  // matching members are declared at the bottom of the class.
+  generator(analysis::axes *a_axes,
+            analysis::layouts *layouts,
+            analysis::align *alignment,
+            analysis::allocation *alloc,
+            analysis::swizzle *swizzle,
+            target *tgt,
+            unsigned num_warps);
+
+  // One visit_* entry point per IR node kind.
+  // NOTE(review): which of these override ir::visitor virtuals cannot be
+  // checked from this header alone.
+  void visit_value(ir::value* v);
+  void visit_phi_node(ir::phi_node*);
+  void visit_binary_operator(ir::binary_operator*);
+  void visit_getelementptr_inst(ir::getelementptr_inst*);
+  void visit_icmp_inst(ir::icmp_inst*);
+  void visit_fcmp_inst(ir::fcmp_inst*);
+  void visit_cast_inst(ir::cast_inst*);
+  void visit_return_inst(ir::return_inst*);
+  void visit_cond_branch_inst(ir::cond_branch_inst*);
+  void visit_uncond_branch_inst(ir::uncond_branch_inst*);
+  void visit_load_inst(ir::load_inst*);
+  void visit_unmasked_load_inst(ir::unmasked_load_inst*);
+  void visit_masked_load_inst(ir::masked_load_inst*);
+  void visit_store_inst(ir::store_inst*);
+  void visit_unmasked_store_inst(ir::unmasked_store_inst*);
+  void visit_masked_store_inst(ir::masked_store_inst*);
+  void visit_reshape_inst(ir::reshape_inst*);
+  void visit_splat_inst(ir::splat_inst*);
+  void visit_broadcast_inst(ir::broadcast_inst*);
+  void visit_downcast_inst(ir::downcast_inst*);
+  void visit_exp_inst(ir::exp_inst*);
+  void visit_log_inst(ir::log_inst*);
+  void visit_get_program_id_inst(ir::get_program_id_inst*);
+  void visit_get_num_program_inst(ir::get_num_program_inst*);
+  void visit_atomic_cas_inst(ir::atomic_cas_inst*);
+  void visit_atomic_exch_inst(ir::atomic_exch_inst*);
+  void visit_atomic_add_inst(ir::atomic_add_inst*);
+  // dot-product variants; all take the dot instruction, operands A, B, D and NK
+  void visit_mma884(ir::dot_inst*, ir::value *A, ir::value *B, ir::value *D, unsigned NK);
+  void visit_mma16816(ir::dot_inst*, ir::value *A, ir::value *B, ir::value *D, unsigned NK);
+  void visit_fmadot(ir::dot_inst*, ir::value *A, ir::value *B, ir::value *D, unsigned NK, Type *c_ty, Function *f_mul_add);
+  void visit_dot_inst(ir::dot_inst*);
+  void visit_trans_inst(ir::trans_inst*);
+  void visit_sqrt_inst(ir::sqrt_inst*);
+  // reduction variants; take a binary combiner over LLVM values and one extra Value*
+  void visit_reduce1d_inst(ir::reduce_inst*, std::function<Value*(Value*,Value*)>, Value*);
+  void visit_reducend_inst(ir::reduce_inst*, std::function<Value*(Value*,Value*)>, Value*);
+  void visit_reduce_inst(ir::reduce_inst*);
+  void visit_select_inst(ir::select_inst*);
+  void visit_recoalesce_inst(ir::recoalesce_inst*);
+  void visit_masked_load_async_inst(ir::masked_load_async_inst*);
+  void visit_copy_to_shared_inst(ir::copy_to_shared_inst*);
+  void visit_copy_from_shared_inst(ir::copy_from_shared_inst*);
+  void visit_barrier_inst(ir::barrier_inst*);
+  void visit_async_wait_inst(ir::async_wait_inst*);
+  void visit_make_range_dyn(ir::make_range_dyn*);
+  void visit_make_range(ir::make_range*);
+  void visit_make_range_sta(ir::make_range_sta*);
+  void visit_undef_value(ir::undef_value*);
+  void visit_constant_int(ir::constant_int*);
+  void visit_constant_fp(ir::constant_fp*);
+  void visit_alloc_const(ir::alloc_const*);
+  void visit_function(ir::function*);
+  void visit_basic_block(ir::basic_block*);
+  void visit_argument(ir::argument*);
+  // top-level entry point: takes the Triton module and the LLVM module
+  void visit(ir::module &, llvm::Module &);
+
+  // layouts
+  void visit_layout_mma(analysis::mma_layout*);
+  void visit_layout_scanline(analysis::scanline_layout*);
+  void visit_layout_shared(analysis::shared_layout*);
+
+
+private:
+  // LLVM objects this generator works with
+  LLVMContext *ctx_;
+  Builder* builder_;
+  Module *mod_;
+
+  // analysis results, target and launch parameters (see constructor)
+  analysis::axes *a_axes_;
+  analysis::swizzle *swizzle_;
+  std::map<unsigned, distributed_axis> axes_;
+  target *tgt_;
+  analysis::layouts *layouts_;
+  analysis::align *alignment_;
+  analysis::allocation *alloc_;
+  Value *shmem_;
+  unsigned num_warps_;
+  std::set<ir::value*> seen_;
+
+  // per-layout LLVM values, keyed by data_layout pointer
+  std::map<analysis::data_layout*, Value*> offset_a_m_;
+  std::map<analysis::data_layout*, Value*> offset_a_k_;
+  std::map<analysis::data_layout*, Value*> offset_b_k_;
+  std::map<analysis::data_layout*, Value*> offset_b_n_;
+
+  std::map<analysis::data_layout*, Value*> shared_ptr_;
+  std::map<analysis::data_layout*, Value*> shared_pre_ptr_;
+  std::map<analysis::data_layout*, Value*> shared_next_ptr_;
+  std::map<analysis::data_layout*, Value*> shared_off_;
+
+
+  // per-IR-value state, keyed by ir::value pointer
+  std::map<ir::value*, Value*> shmems_;
+  std::map<ir::value*, Value*> shoffs_;
+  std::map<ir::value*, std::vector<indices_t>> idxs_;
+  std::map<ir::value*, std::map<indices_t, Value*>> vals_;
+  std::map<ir::value*, BasicBlock *> bbs_;
+  std::map<ir::value*, std::vector<int>> ords_;
+
+};
+
+}
+}
+
+#endif
--- a/include/triton/codegen/target.h
+++ b/include/triton/codegen/target.h
@@ -0,0 +1,105 @@
+#ifndef TDL_INCLUDE_IR_CODEGEN_TARGET_H
+#define TDL_INCLUDE_IR_CODEGEN_TARGET_H
+
+namespace llvm{
+  class Type;
+  class Value;
+  class Instruction;
+  class Constant;
+  class LLVMContext;
+  class Module;
+  class ConstantFolder;
+  class IRBuilderDefaultInserter;
+  template <typename T, typename Inserter>
+  class IRBuilder;
+  class ArrayType;
+  class Function;
+}
+
+// typedefs
+// Unqualified aliases inside triton::codegen for the LLVM types that are
+// forward-declared at the top of this header.
+namespace triton{
+namespace codegen{
+  typedef llvm::IRBuilder<llvm::ConstantFolder,
+                          llvm::IRBuilderDefaultInserter> Builder;
+  typedef llvm::LLVMContext LLVMContext;
+  typedef llvm::Type Type;
+  typedef llvm::Value Value;
+  typedef llvm::Module Module;
+  typedef llvm::Instruction Instruction;
+  typedef llvm::Constant Constant;
+  typedef llvm::ArrayType ArrayType;
+  typedef llvm::Function Function;
+}
+}
+
+namespace triton{
+namespace codegen{
+
+class nvidia_cu_target;
+
+// Abstract interface for a code-generation target. Every hook below is pure
+// virtual and is implemented by amd_cl_target, nvidia_cu_target and
+// cpu_target later in this header.
+class target {
+public:
+  // `is_gpu` is stored and reported back by is_gpu()
+  target(bool is_gpu): is_gpu_(is_gpu){}
+  virtual ~target() {}
+  // Target-specific hooks. Each one receives the LLVM module and/or builder
+  // to work with; what gets emitted is decided by the concrete target.
+  virtual void set_kernel(Builder& builder, LLVMContext &ctx, Module *module, Function* fn) = 0;
+  virtual Instruction* add_barrier(Module *module, Builder& builder) = 0;
+  virtual Instruction* add_memfence(Module *module, Builder& builder) = 0;
+  virtual Value* get_global_offset(Module *module, Builder& builder, unsigned stride, unsigned ax) = 0;
+  virtual Value* get_local_id(Module *module, Builder& builder, unsigned ax) = 0;
+  virtual Value* get_block_id(Module *module, Builder& builder, unsigned ax) = 0;
+  virtual Value* get_num_blocks(Module *module, Builder& builder, unsigned ax) = 0;
+  virtual unsigned guaranteed_alignment() = 0;
+  // Defined elsewhere. NOTE(review): presumably a downcast of `this`; what
+  // happens for a non-NVIDIA target is not visible here.
+  nvidia_cu_target* as_nvidia();
+  bool is_gpu() const;
+
+private:
+  bool is_gpu_;  // value given to the constructor
+};
+
+// Concrete target that constructs its base with is_gpu = true and reports a
+// guaranteed alignment of 16. All other hooks are defined elsewhere.
+class amd_cl_target: public target {
+public:
+  amd_cl_target(): target(true){}
+  // `override` makes the compiler verify each signature against the pure
+  // virtuals declared in `target`.
+  void set_kernel(Builder& builder, LLVMContext &ctx, Module *module, Function* fn) override;
+  Instruction* add_barrier(Module *module, Builder& builder) override;
+  Instruction* add_memfence(Module *module, Builder& builder) override;
+  Value* get_global_offset(Module *module, Builder& builder, unsigned stride, unsigned ax) override;
+  Value* get_local_id(Module *module, Builder& builder, unsigned ax) override;
+  Value* get_block_id(Module *module, Builder& builder, unsigned ax) override;
+  Value* get_num_blocks(Module *module, Builder& builder, unsigned ax) override;
+  unsigned guaranteed_alignment() override { return 16; }
+};
+
+// Concrete target that constructs its base with is_gpu = true, stores the
+// integer `sm` it was built with, and reports a guaranteed alignment of 16.
+// All other hooks are defined elsewhere.
+class nvidia_cu_target: public target {
+public:
+  nvidia_cu_target(int sm): target(true), sm_(sm){}
+  // `override` makes the compiler verify each signature against the pure
+  // virtuals declared in `target`.
+  void set_kernel(Builder& builder, LLVMContext &ctx, Module *module, Function* fn) override;
+  Instruction* add_barrier(Module *module, Builder& builder) override;
+  Instruction* add_memfence(Module *module, Builder& builder) override;
+  Value* get_global_offset(Module *module, Builder& builder, unsigned stride, unsigned ax) override;
+  Value* get_local_id(Module *module, Builder& builder, unsigned ax) override;
+  Value* get_block_id(Module *module, Builder& builder, unsigned ax) override;
+  Value* get_num_blocks(Module *module, Builder& builder, unsigned ax) override;
+  // returns the value passed to the constructor
+  int sm() { return sm_; }
+  unsigned guaranteed_alignment() override { return 16; }
+
+private:
+  int sm_;
+};
+
+// Concrete target that constructs its base with is_gpu = false and reports a
+// guaranteed alignment of 1. All other hooks are defined elsewhere.
+class cpu_target: public target {
+public:
+  cpu_target(): target(false){}
+  // `override` makes the compiler verify each signature against the pure
+  // virtuals declared in `target`.
+  void set_kernel(Builder& builder, LLVMContext &ctx, Module *module, Function* fn) override;
+  Instruction* add_barrier(Module *module, Builder& builder) override;
+  Instruction* add_memfence(Module *module, Builder& builder) override;
+  Value* get_global_offset(Module *module, Builder& builder, unsigned stride, unsigned ax) override;
+  Value* get_local_id(Module *module, Builder& builder, unsigned ax) override;
+  Value* get_block_id(Module *module, Builder& builder, unsigned ax) override;
+  Value* get_num_blocks(Module *module, Builder& builder, unsigned ax) override;
+  unsigned guaranteed_alignment() override { return 1; }
+};
+
+}
+}
+
+#endif
--- a/include/triton/codegen/transform/coalesce.h
+++ b/include/triton/codegen/transform/coalesce.h
@@ -0,0 +1,47 @@
+#ifndef TDL_INCLUDE_CODEGEN_OPTIMIZE_REORDER_H
+#define TDL_INCLUDE_CODEGEN_OPTIMIZE_REORDER_H
+
+#include <map>
+#include <set>
+#include <vector>
+
+namespace triton {
+
+namespace ir {
+  class module;
+  class value;
+  class io_inst;
+  class instruction;
+  class builder;
+}
+
+namespace codegen{
+
+namespace analysis{
+  class align;
+  class layouts;
+  class cts;
+}
+
+namespace transform{
+
+// Transformation over an ir::module that is constructed from an alignment
+// analysis and a layouts analysis. All member functions are defined elsewhere.
+// NOTE(review): the rewrite performed by run() is not visible in this header.
+class coalesce {
+private:
+  // helpers; results are returned through the reference parameters
+  void extract_io_use(ir::value *v, std::set<ir::io_inst*>& result);
+  void extract_ld(ir::io_inst *i, std::map<int, std::vector<triton::ir::io_inst *> > &result);
+  // `seen` maps an original value to another value; presumably a memo of
+  // values already rematerialized -- TODO confirm
+  ir::value* rematerialize(ir::value *v, ir::builder& builder, std::map<ir::value*, ir::value*>& seen);
+
+public:
+  coalesce(analysis::align* align, triton::codegen::analysis::layouts *layouts);
+  void run(ir::module &mod);
+
+private:
+  analysis::align* align_;     // raw pointer member; initialization is not shown here
+  analysis::layouts* layout_;  // raw pointer member; initialization is not shown here
+};
+
+}
+}
+}
+
+#endif
--- a/include/triton/codegen/transform/cts.h
+++ b/include/triton/codegen/transform/cts.h
@@ -0,0 +1,36 @@
+#ifndef TDL_INCLUDE_CODEGEN_BUFFER_INFO_PASS_H
+#define TDL_INCLUDE_CODEGEN_BUFFER_INFO_PASS_H
+
+#include <set>
+#include <map>
+
+namespace triton {
+
+namespace ir {
+  class module;
+  class value;
+  class phi_node;
+  class instruction;
+  class builder;
+}
+
+namespace codegen{
+namespace transform{
+
+// Transformation over an ir::module, configured by a single boolean.
+// NOTE(review): the guard and helper names suggest it inserts copies to/from
+// shared memory, but the implementation is not visible here.
+class cts {
+private:
+  // helper defined elsewhere; `to_shared` presumably selects the copy direction
+  void add_copy(ir::instruction *parent, ir::value *x, ir::builder &builder, bool to_shared);
+
+public:
+  // `use_async` defaults to false and is only stored here
+  cts(bool use_async = false): use_async_(use_async) {}
+  void run(ir::module &mod);
+
+private:
+  bool use_async_;  // value given to the constructor
+};
+
+}
+}
+}
+
+#endif
--- a/include/triton/codegen/transform/dce.h
+++ b/include/triton/codegen/transform/dce.h
@@ -0,0 +1,24 @@
+#ifndef TDL_INCLUDE_CODEGEN_OPTIMIZE_CSE_H
+#define TDL_INCLUDE_CODEGEN_OPTIMIZE_CSE_H
+
+
+namespace triton {
+
+namespace ir {
+  class module;
+}
+
+namespace codegen{
+namespace transform{
+
+// Stateless transformation over an ir::module; run() is defined elsewhere.
+// NOTE(review): presumably dead-code elimination, judging by the name only.
+class dce {
+public:
+  dce() {}
+  void run(ir::module &mod);
+};
+
+}
+}
+}
+
+#endif
--- a/include/triton/codegen/transform/disassociate.h
+++ b/include/triton/codegen/transform/disassociate.h
@@ -0,0 +1,22 @@
+#ifndef _TRITON_SELECTION_TRANSFORM_DISASSOCIATE_H_
+#define _TRITON_SELECTION_TRANSFORM_DISASSOCIATE_H_
+
+
+namespace triton {
+namespace ir {
+  class module;
+}
+
+namespace codegen{
+namespace transform{
+
+// Stateless transformation over an ir::module; run() is defined elsewhere and
+// its effect is not visible in this header.
+class disassociate {
+public:
+  void run(ir::module &mod);
+};
+
+}
+}
+}
+
+#endif
--- a/include/triton/codegen/transform/membar.h
+++ b/include/triton/codegen/transform/membar.h
@@ -0,0 +1,62 @@
+#ifndef TDL_INCLUDE_CODEGEN_BARRIERS_H
+#define TDL_INCLUDE_CODEGEN_BARRIERS_H
+
+#include <vector>
+#include <map>
+#include <list>
+#include <set>
+
+namespace triton {
+
+namespace ir {
+  class module;
+  class basic_block;
+  class instruction;
+  class masked_load_async_inst;
+  class value;
+  class builder;
+}
+
+namespace codegen{
+
+namespace analysis{
+
+class allocation;
+class liveness;
+class layouts;
+class cts;
+
+}
+
+namespace transform{
+
+// Transformation over an ir::module that is given liveness, layouts and
+// allocation analyses at construction.
+// NOTE(review): presumably inserts memory barriers (per the name and the
+// `inserted` flag of transfer()); the implementation is not visible here.
+class membar {
+private:
+  typedef std::pair<unsigned, unsigned> interval_t;
+  typedef std::set<ir::value*> val_set_t;
+  typedef std::vector<ir::value*> val_vec_t;
+
+private:
+  // helpers, all defined elsewhere
+  bool intersect(const val_set_t &X, const val_set_t &Y);
+  int group_of(triton::ir::value *i, std::vector<triton::ir::value *> &async_write);
+  val_set_t intersect_with(const val_set_t& as, const val_set_t& bs);
+  // per-basic-block step; state is passed in and out through the reference
+  // parameters, including the `inserted` flag
+  void transfer(ir::basic_block *block, val_vec_t &async_write, val_set_t &sync_write, val_set_t &sync_read,
+                std::set<triton::ir::value *> &safe_war, bool &inserted, ir::builder &builder);
+
+public:
+  // stores the three raw pointers exactly as given
+  membar(analysis::liveness *liveness, analysis::layouts *layouts, analysis::allocation *alloc):
+    liveness_(liveness), layouts_(layouts), alloc_(alloc) {}
+  void run(ir::module &mod);
+
+private:
+  analysis::liveness *liveness_;
+  analysis::layouts *layouts_;
+  analysis::allocation *alloc_;
+};
+
+
+}
+}
+}
+
+#endif
--- a/include/triton/codegen/transform/peephole.h
+++ b/include/triton/codegen/transform/peephole.h
@@ -0,0 +1,54 @@
+#ifndef TDL_INCLUDE_CODEGEN_OPTIMIZE_TRANS_H
+#define TDL_INCLUDE_CODEGEN_OPTIMIZE_TRANS_H
+
+#include "triton/codegen/target.h"
+
+namespace triton {
+
+namespace ir {
+  class module;
+  class value;
+  class instruction;
+  class trans_inst;
+  class builder;
+  class constant_int;
+  class dot_inst;
+}
+
+namespace codegen{
+namespace analysis{
+class layouts;
+}
+
+namespace transform{
+
+// Transformation over an ir::module built from a target and a layouts
+// analysis. It declares a family of rewrite_* helpers that each return bool.
+// NOTE(review): presumably the bool reports whether a rewrite was applied --
+// confirm against the implementation.
+class peephole {
+private:
+//  bool rewrite_cts_cfs(ir::instruction *value, ir::builder &builder);
+  bool rewrite_trans_phi(ir::instruction* value, ir::builder &builder);
+  // dot-specific rewrites; take the dot instruction, transposition flags and operands A, B, D
+  bool rewrite_dot_fp32(ir::dot_inst *dot, ir::builder& builder, bool trans_a, bool trans_b, ir::value *A, ir::value *B, ir::value *D);
+  bool rewrite_dot_hmma(ir::dot_inst *dot, ir::builder& builder, bool trans_a, bool trans_b, ir::value *A, ir::value *B, ir::value *D);
+  bool rewrite_dot(ir::instruction *value, ir::builder& builder);
+  bool rewrite_mult(ir::instruction *value, ir::builder& builder);
+  bool rewrite_unit_red(ir::instruction *value, ir::builder& builder);
+  bool rewrite_gep_ptr_min_off_plus_off(ir::instruction *value, ir::builder& builder);
+  bool rewrite_select_masked_load(ir::instruction *value, ir::builder& builder);
+  bool rewrite_load_to_shared(ir::instruction *value, ir::builder& builder);
+
+private:
+
+public:
+  // stores both raw pointers exactly as given
+  peephole(target* tgt, analysis::layouts* layouts): tgt_(tgt), layouts_(layouts) {}
+  void run(ir::module &mod);
+
+private:
+  target* tgt_;
+  analysis::layouts* layouts_;
+};
+
+
+}
+}
+}
+
+#endif
--- a/include/triton/codegen/transform/pipeline.h
+++ b/include/triton/codegen/transform/pipeline.h
@@ -0,0 +1,28 @@
+#ifndef TRITON_INCLUDE_IR_CODEGEN_PIPELINE_H
+#define TRITON_INCLUDE_IR_CODEGEN_PIPELINE_H
+
+// forward declaration
+namespace triton {
+namespace ir {
+class module;
+}
+} // namespace triton
+
+namespace triton {
+namespace codegen {
+namespace transform {
+
+// Transformation over an ir::module configured by one boolean; run() is
+// defined elsewhere and its effect is not visible in this header.
+class pipeline {
+public:
+  // `has_copy_async` is only stored here
+  pipeline(bool has_copy_async): has_copy_async_(has_copy_async) {}
+  void run(ir::module &module);
+
+private:
+  bool has_copy_async_;  // value given to the constructor
+};
+
+} // namespace transform
+} // namespace codegen
+} // namespace triton
+
+#endif
--- a/include/triton/codegen/transform/reassociate.h
+++ b/include/triton/codegen/transform/reassociate.h
@@ -0,0 +1,49 @@
+#ifndef TDL_INCLUDE_IR_CODEGEN_REASSOCIATE_H
+#define TDL_INCLUDE_IR_CODEGEN_REASSOCIATE_H
+
+#include <map>
+#include <set>
+#include <vector>
+
+namespace triton {
+
+// forward declaration
+namespace ir {
+class module;
+class value;
+class builder;
+class instruction;
+class getelementptr_inst;
+}
+
+namespace codegen{
+
+namespace analysis{
+class tiles;
+class align;
+}
+
+namespace transform{
+
+// Stateless transformation over an ir::module; all member functions are
+// defined elsewhere.
+class reassociate {
+  // Pair of pointers stored per value in the `offsets` map of
+  // reassociate_ptr(). NOTE(review): presumably the dynamic and the static
+  // part of a pointer expression -- inferred from the field names only.
+  struct cst_info {
+    ir::value* dyn_ptr;
+    ir::getelementptr_inst* sta_ptr;
+  };
+
+private:
+  // NOTE(review): presumably returns `x` as an instruction when it is a
+  // binary add and null otherwise -- confirm against the implementation
+  ir::instruction* is_bin_add(ir::value *x);
+  // `noncst` and `cst` are pointer references, i.e. extra outputs
+  ir::value *reassociate_idx(ir::value *value, ir::builder &builder, ir::value *&noncst, ir::value *&cst);
+  ir::value *reassociate_ptr(ir::getelementptr_inst* pz, ir::builder &builder, std::map<ir::value*, cst_info> &offsets);
+
+public:
+  void run(ir::module& module);
+};
+
+}
+
+}
+
+}
+
+#endif
--- a/include/triton/codegen/transform/reorder.h
+++ b/include/triton/codegen/transform/reorder.h
@@ -0,0 +1,26 @@
+#ifndef TRITON_INCLUDE_IR_CODEGEN_REORDER_H
+#define TRITON_INCLUDE_IR_CODEGEN_REORDER_H
+
+namespace triton {
+
+// forward declaration
+namespace ir {
+class module;
+}
+
+namespace codegen{
+
+namespace transform{
+
+// Stateless transformation over an ir::module; run() is defined elsewhere and
+// its effect is not visible in this header.
+class reorder {
+public:
+  void run(ir::module& module);
+};
+
+}
+
+}
+
+}
+
+#endif
--- a/include/triton/driver/backend.h
+++ b/include/triton/driver/backend.h
@@ -0,0 +1,137 @@
+#pragma once
+
+#ifndef _TRITON_DRIVER_BACKEND_H_
+#define _TRITON_DRIVER_BACKEND_H_
+
+
+#include <map>
+#include <list>
+#include <vector>
+#include "triton/driver/context.h"
+
+namespace llvm
+{
+class Module;
+}
+
+namespace triton
+{
+namespace driver
+{
+
+class buffer;
+class stream;
+class device;
+class context;
+class platform;
+class module;
+class kernel;
+
+// Groups static caches of driver objects (platforms, devices, modules,
+// kernels, contexts, streams). Every member is static; nothing here is meant
+// to be instantiated. Each nested class befriends `backend` so that the
+// backend-level functions can reach its private members.
+struct backend
+{
+
+  // platforms
+  class platforms
+  {
+    friend class backend;
+  private:
+    static void init();
+
+  public:
+    // results are returned through the reference parameter
+    static void get(std::vector<driver::platform*> &results);
+
+  private:
+    static std::vector<driver::platform*> cache_;
+  };
+
+  // devices
+  class devices
+  {
+    friend class backend;
+
+  private:
+    static void init(const std::vector<platform *> &platforms);
+
+  public:
+    // results are returned through the reference parameter
+    static void get(std::vector<driver::device*>& devs);
+
+  private:
+    static std::vector<driver::device*> cache_;
+  };
+
+  // modules
+  class modules
+  {
+    friend class backend;
+
+  public:
+    static void release();
+
+  private:
+    // keyed by (stream, string)
+    static std::map<std::tuple<driver::stream*, std::string>, driver::module*> cache_;
+  };
+
+  // kernels
+  class kernels
+  {
+    friend class backend;
+  public:
+    static void release();
+    static driver::kernel* get(driver::module* mod, const std::string & name);
+  private:
+    // keyed by (module, kernel name)
+    static std::map<std::tuple<module*, std::string>, driver::kernel*> cache_;
+  };
+
+  // contexts
+  class contexts
+  {
+    friend class backend;
+  private:
+    static void init(const std::vector<device *> &);
+    static void release();
+  public:
+    static driver::context* get_default();
+
+    // Returns the cached context whose CUDA handle equals `ctx`; when none
+    // matches, appends a new cu_context built with take_ownership = false
+    // and returns it.
+    // NOTE(review): every cached entry is cast to cu_context* unconditionally,
+    // which assumes the cache only ever holds CUDA contexts -- confirm.
+    static driver::context* import(CUcontext ctx)
+    {
+      for(driver::context* x: cache_){
+        driver::cu_context* cu_x = (driver::cu_context*)x;
+        if(*cu_x->cu()==ctx)
+          return x;
+      }
+      cache_.emplace_back(new driver::cu_context(ctx, false));
+      return cache_.back();
+    }
+
+    // results are returned through the reference parameter
+    static void get(std::list<driver::context*> &);
+
+  private:
+    static std::list<driver::context*> cache_;
+  };
+
+  // streams
+  class streams
+  {
+    friend class backend;
+  private:
+    static void init(std::list<context*> const &);
+    static void release();
+  public:
+    // all streams of a context, returned through the reference parameter
+    static void get(driver::context*, std::vector<driver::stream *> &streams);
+    // a single stream of a context, selected by `id` (default 0)
+    static driver::stream* get(driver::context*, unsigned int id = 0);
+    static driver::stream* get_default();
+  private:
+    // streams grouped per context
+    static std::map<driver::context*, std::vector<driver::stream*> > cache_;
+  };
+
+  // backend-wide entry points, defined elsewhere
+  static void init();
+  static void release();
+  static void synchronize(triton::driver::context *);
+
+  static unsigned int default_device;
+};
+
+}
+}
+
+#endif
--- a/include/triton/driver/buffer.h
+++ b/include/triton/driver/buffer.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#ifndef _TRITON_DRIVER_BUFFER_H_
+#define _TRITON_DRIVER_BUFFER_H_
+
+#include "triton/driver/handle.h"
+#include "triton/driver/context.h"
+
+namespace triton
+{
+namespace driver
+{
+
+class stream;
+
+// Base
+// Base buffer type: a polymorphic_resource over either a CUdeviceptr or a
+// host_buffer_t, plus a size. All member functions are defined elsewhere.
+class buffer : public polymorphic_resource<CUdeviceptr, host_buffer_t> {
+public:
+  // one constructor per underlying handle type
+  buffer(size_t size, CUdeviceptr cl, bool take_ownership);
+  buffer(size_t size, host_buffer_t hst, bool take_ownership);
+  uintptr_t addr_as_uintptr_t();
+  // factory taking the context and the requested size
+  static buffer* create(driver::context* ctx, size_t size);
+  size_t size();
+
+protected:
+  size_t size_;  // presumably the value returned by size() -- TODO confirm
+};
+
+// CPU
+// Buffer subclass for the host (CPU) side; constructed from a size only.
+class host_buffer: public buffer
+{
+public:
+  host_buffer(size_t size);
+};
+
+// CUDA
+// Buffer subclass for CUDA; constructed either from a size alone or from an
+// existing CUdeviceptr together with an ownership flag.
+class cu_buffer: public buffer
+{
+public:
+  cu_buffer(size_t size);
+  cu_buffer(size_t size, CUdeviceptr cu, bool take_ownership);
+  // defined elsewhere; takes the stream to use and a byte count
+  // NOTE(review): presumably zero-fills `size` bytes on `queue` -- confirm
+  void set_zero(triton::driver::stream *queue, size_t size);
+};
+
+}
+}
+
+#endif
--- a/include/triton/driver/context.h
+++ b/include/triton/driver/context.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#ifndef _TRITON_DRIVER_CONTEXT_H_
+#define _TRITON_DRIVER_CONTEXT_H_
+
+#include "triton/driver/device.h"
+#include "triton/driver/handle.h"
+
+namespace triton
+{
+namespace driver
+{
+
+// Base context type: a polymorphic_resource over either a CUcontext or a
+// host_context_t, associated with a device and a cache path string.
+// All member functions are defined elsewhere.
+class context: public polymorphic_resource<CUcontext, host_context_t>{
+protected:
+  static std::string get_cache_path();
+
+public:
+  // one constructor per underlying handle type
+  context(driver::device *dev, CUcontext cu, bool take_ownership);
+  context(driver::device *dev, host_context_t hst, bool take_ownership);
+  driver::device* device() const;
+  std::string const & cache_path() const;
+  // factory methods
+  static context* create(driver::device *dev);
+
+protected:
+  driver::device* dev_;     // presumably what device() returns -- TODO confirm
+  std::string cache_path_;  // presumably what cache_path() returns -- TODO confirm
+};
+
+// Host
+// Context subclass for the host (CPU) side; constructed from a device.
+class host_context: public context {
+public:
+  host_context(driver::device* dev);
+};
+
+// CUDA
+// Context subclass for CUDA. It can wrap an existing CUcontext or be built
+// from a device.
+class cu_context: public context {
+private:
+  // helper defined elsewhere; maps a CUcontext to a CUdevice
+  static CUdevice get_device_of(CUcontext);
+public:
+  //Constructors
+  // wraps an existing handle; take_ownership defaults to true
+  cu_context(CUcontext cu, bool take_ownership = true);
+  cu_context(driver::device* dev);
+};
+
+}
+}
+
+#endif
--- a/include/triton/driver/device.h
+++ b/include/triton/driver/device.h
@@ -0,0 +1,81 @@
+#pragma once
+
+#ifndef _TRITON_DRIVER_DEVICE_H_
+#define _TRITON_DRIVER_DEVICE_H_
+
+#include "triton/driver/platform.h"
+#include "triton/driver/handle.h"
+
+namespace triton
+{
+
+namespace codegen
+{
+class target;
+}
+
+namespace driver
+{
+
+class context;
+
+// Base device
+// Abstract device: a polymorphic_resource over either a CUdevice or a
+// host_device_t. Concrete devices must provide the three queries below.
+class device: public polymorphic_resource<CUdevice, host_device_t>{
+public:
+  // inherit the base-class constructors unchanged
+  using polymorphic_resource::polymorphic_resource;
+  virtual size_t max_threads_per_block() const = 0;
+  virtual size_t max_shared_memory() const = 0;
+  // returns a uniquely-owned codegen::target for this device
+  virtual std::unique_ptr<codegen::target> make_target() const = 0;
+};
+
+// Host device
+// Host (CPU) device: reports 1 thread per block and 0 bytes of shared memory.
+class host_device: public device {
+public:
+  // NOTE(review): the second base-constructor argument is presumably the
+  // take_ownership flag of polymorphic_resource -- confirm in handle.h
+  host_device(): device(host_device_t(), true){ }
+  // `override` makes the compiler verify each signature against the pure
+  // virtuals declared in `device`.
+  size_t max_threads_per_block() const override { return 1; }
+  size_t max_shared_memory() const override { return 0; }
+  // defined elsewhere
+  std::unique_ptr<codegen::target> make_target() const override;
+};
+
+// CUDA device
+// CUDA device. Besides the three queries required by `device`, it declares a
+// set of informational getters; all of them are defined elsewhere.
+class cu_device: public device {
+private:
+  // Metaprogramming helper to get CUDA info from an attribute
+  template<CUdevice_attribute attr>
+  int cuGetInfo() const;
+
+  inline nvmlDevice_t nvml_device() const;
+
+public:
+  // wraps a CUdevice handle; both arguments are forwarded to the base class
+  cu_device(CUdevice cu = CUdevice(), bool take_ownership = true): device(cu, take_ownership){}
+  // Information
+  std::string infos() const;
+  size_t address_bits() const;
+  std::vector<size_t> max_block_dim() const;
+  size_t warp_size() const;
+  // Compute Capability
+  // NOTE(review): interpret_as() presumably overrides the value reported by
+  // compute_capability() via interpreted_as_ -- confirm in the implementation
+  void interpret_as(int cc);
+  int compute_capability() const;
+  // Identifier
+  std::string name() const;
+  std::string pci_bus_id() const;
+  // Clocks
+  size_t current_sm_clock() const;
+  size_t current_mem_clock() const;
+  // `override` makes the compiler verify these two signatures against the
+  // pure virtuals declared in `device`.
+  size_t max_threads_per_block() const override;
+  size_t max_shared_memory() const override;
+  size_t max_sm_clock() const;
+  size_t max_mem_clock() const;
+  void set_max_clock();
+  // Target
+  std::unique_ptr<codegen::target> make_target() const override;
+
+private:
+  std::shared_ptr<int> interpreted_as_;
+};
+
+}
+
+}
+
+#endif
--- a/include/triton/driver/dispatch.h
+++ b/include/triton/driver/dispatch.h
@@ -0,0 +1,185 @@
+#pragma once
+
+#ifndef _TRITON_DRIVER_DISPATCH_H_
+#define _TRITON_DRIVER_DISPATCH_H_
+
+#include <string>
+#include <type_traits>
+#include <dlfcn.h>
+
+//CUDA Backend
+#include "triton/external/CUDA/cuda.h"
+#include "triton/external/CUDA/nvml.h"
+
+//Exceptions
+#include <iostream>
+#include <stdexcept>
+
+namespace llvm {
+class PassRegistry;
+class Module;
+}
+
+namespace triton
+{
+namespace driver
+{
+
+class cu_context;
+
+// Result checking used by dispatch::f_impl. The generic overload accepts any
+// type and does nothing; the CUresult overload is defined elsewhere
+// (presumably it raises on a CUDA error code -- TODO confirm).
+template<class T> void check(T){}
+void check(CUresult err);
+
+// Static facade over dynamically loaded libraries. Each public wrapper has a
+// matching `void*` cache slot below that holds the resolved symbol address;
+// f_impl() performs the lookup and the call.
+class dispatch
+{
+protected:
+  // Extracts the return type R from a function-pointer type R(*)(A...).
+  template <class F>
+  struct return_type;
+
+  template <class R, class... A>
+  struct return_type<R (*)(A...)>
+  { typedef R type; };
+
+  typedef bool (*f_init_t)();
+
+  // Calls the function named `name` from the library handle `lib_h`.
+  //  - initializer: invoked on every call before anything else; its return
+  //    value is not inspected here
+  //  - FunPtrT:     function-pointer type of the symbol (value unused)
+  //  - cache:       slot holding the resolved address; filled on first use
+  //  - args:        forwarded by value to the resolved function
+  // The result is handed to check() and then returned.
+  // Throws std::runtime_error when dlsym cannot resolve `name`.
+  template<f_init_t initializer, typename FunPtrT, typename... Args>
+  static typename return_type<FunPtrT>::type f_impl(void*& lib_h, FunPtrT, void*& cache, const char * name, Args... args)
+  {
+    initializer();
+    if(cache == nullptr){
+      cache = dlsym(lib_h, name);
+      // name the missing symbol so the failure can be diagnosed
+      if(cache == nullptr)
+        throw std::runtime_error(std::string("dlsym unable to load function ") + name);
+    }
+    FunPtrT fptr;
+    // object-pointer to function-pointer conversion done through a void**
+    // write, the usual way to consume a dlsym result
+    *reinterpret_cast<void **>(&fptr) = cache;
+    typename return_type<FunPtrT>::type res = (*fptr)(args...);
+    check(res);
+    return res;
+  }
+
+public:
+  // library initializers (defined elsewhere) and global teardown
+  static bool nvmlinit();
+  static bool cuinit();
+  static bool spvllvminit();
+  static void release();
+
+  // CUDA
+  static CUresult cuCtxGetCurrent(CUcontext *pctx);
+  static CUresult cuCtxSetCurrent(CUcontext ctx);
+  static CUresult cuCtxDestroy_v2(CUcontext ctx);
+  static CUresult cuEventCreate(CUevent *phEvent, unsigned int Flags);
+  static CUresult cuDeviceGet(CUdevice *device, int ordinal);
+  static CUresult cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
+  static CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags);
+  static CUresult cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
+  static CUresult cuMemFree_v2(CUdeviceptr dptr);
+  static CUresult cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+  static CUresult cuDriverGetVersion(int *driverVersion);
+  static CUresult cuDeviceGetName(char *name, int len, CUdevice dev);
+  static CUresult cuDeviceGetPCIBusId(char *id, int len, CUdevice dev);
+  static CUresult cuModuleGetGlobal_v2(CUdeviceptr *dptr, size_t* bytes, CUmodule hmod, const char *name);
+  static CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
+  static CUresult cuModuleLoad(CUmodule *module, const char *fname);
+  static CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
+  static CUresult cuModuleUnload(CUmodule hmod);
+  static CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
+  static CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
+  static CUresult cuDeviceGetCount(int *count);
+  static CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
+  static CUresult cuInit(unsigned int Flags);
+  static CUresult cuEventRecord(CUevent hEvent, CUstream hStream);
+  static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
+  static CUresult cuCtxPushCurrent_v2(CUcontext ctx);
+  static CUresult cuCtxPopCurrent_v2(CUcontext *pctx);
+  static CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
+  static CUresult cuStreamSynchronize(CUstream hStream);
+  static CUresult cuStreamGetCtx(CUstream hStream, CUcontext* pctx);
+  static CUresult cuStreamDestroy_v2(CUstream hStream);
+  static CUresult cuEventDestroy_v2(CUevent hEvent);
+  static CUresult cuMemAlloc_v2(CUdeviceptr *dptr, size_t bytesize);
+  static CUresult cuPointerGetAttribute(void * data, CUpointer_attribute attribute, CUdeviceptr ptr);
+  static CUresult cuCtxGetDevice(CUdevice* result);
+  static CUresult cuMemsetD8Async(CUdeviceptr dst, unsigned char x, size_t N, CUstream stream);
+  static CUresult cuFuncGetAttribute(int* pi, CUfunction_attribute attrib, CUfunction hfunc);
+  static CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int  value);
+  static CUresult cuFuncSetCacheConfig (CUfunction hfunc, CUfunc_cache config);
+  // NVML
+  static nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2( const char* pciBusId, nvmlDevice_t* device);
+  static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
+  static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
+  static nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int mem_clock, unsigned int sm_clock);
+
+
+  // SPIR-V libraries
+  static int initializeLLVMToSPIRVPass(llvm::PassRegistry &);
+  static bool writeSpirv(llvm::Module *M, std::ostream &OS, std::string &ErrMsg);
+
+
+private:
+
+  // Libraries
+  static void* cuda_;
+  static void* nvml_;
+  static void* vulkan_;
+  static void* spvllvm_;
+  static void* spvcross_;
+  static void* opengl_;
+
+
+  // CUDA functions
+  static void* cuCtxGetCurrent_;
+  static void* cuCtxSetCurrent_;
+  static void* cuCtxDestroy_v2_;
+  static void* cuEventCreate_;
+  static void* cuDeviceGet_;
+  static void* cuMemcpyDtoH_v2_;
+  static void* cuStreamCreate_;
+  static void* cuEventElapsedTime_;
+  static void* cuMemFree_v2_;
+  static void* cuMemcpyDtoHAsync_v2_;
+  static void* cuDriverGetVersion_;
+  static void* cuDeviceGetName_;
+  static void* cuDeviceGetPCIBusId_;
+  static void* cuModuleGetGlobal_v2_;
+  static void* cuMemcpyHtoDAsync_v2_;
+  static void* cuModuleLoad_;
+  static void* cuLaunchKernel_;
+  static void* cuModuleUnload_;
+  static void* cuModuleLoadDataEx_;
+  static void* cuDeviceGetAttribute_;
+  static void* cuDeviceGetCount_;
+  static void* cuMemcpyHtoD_v2_;
+  static void* cuInit_;
+  static void* cuEventRecord_;
+  static void* cuCtxCreate_v2_;
+  static void* cuModuleGetFunction_;
+  static void* cuStreamSynchronize_;
+  static void* cuStreamDestroy_v2_;
+  static void* cuStreamGetCtx_;
+  static void* cuEventDestroy_v2_;
+  static void* cuMemAlloc_v2_;
+  static void* cuPointerGetAttribute_;
+  static void* cuCtxGetDevice_;
+  static void* cuMemsetD8Async_;
+  static void* cuCtxPushCurrent_v2_;
+  static void* cuCtxPopCurrent_v2_;
+  static void* cuFuncGetAttribute_;
+  static void* cuFuncSetAttribute_;
+  static void* cuFuncSetCacheConfig_;
+  // NVML
+  static void* nvmlInit_v2_;
+  static void* nvmlDeviceGetHandleByPciBusId_v2_;
+  static void* nvmlDeviceGetClockInfo_;
+  static void* nvmlDeviceGetMaxClockInfo_;
+  static void* nvmlDeviceSetApplicationsClocks_;
+
+  // LLVM to SPIR-V
+  static void* initializeLLVMToSPIRVPass_;
+  static void* writeSpirv_;
+};
+
+}
+}
+
+
+#endif
--- a/include/triton/driver/error.h
+++ b/include/triton/driver/error.h
@@ -0,0 +1,148 @@
+#pragma once
+
+#ifndef _TRITON_DRIVER_ERROR_H_
+#define _TRITON_DRIVER_ERROR_H_
+
+#include <exception>
+#include "triton/driver/dispatch.h"
+
+
+namespace triton
+{
+
+  namespace driver
+  {
+
+  namespace exception
+  {
+
+  namespace nvrtc
+  {
+  // One std::exception subclass per NVRTC failure; what() returns "NVRTC: Error- " followed by msg.
+#define TRITON_CREATE_NVRTC_EXCEPTION(name, msg) class name: public std::exception { public: const char * what() const noexcept override { return "NVRTC: Error- " msg; } }
+
+  TRITON_CREATE_NVRTC_EXCEPTION(out_of_memory              ,"out of memory");
+  TRITON_CREATE_NVRTC_EXCEPTION(program_creation_failure   ,"program creation failure");
+  TRITON_CREATE_NVRTC_EXCEPTION(invalid_input              ,"invalid input");
+  TRITON_CREATE_NVRTC_EXCEPTION(invalid_program            ,"invalid program");
+  TRITON_CREATE_NVRTC_EXCEPTION(invalid_option             ,"invalid option");
+  TRITON_CREATE_NVRTC_EXCEPTION(compilation                ,"compilation");
+  TRITON_CREATE_NVRTC_EXCEPTION(builtin_operation_failure  ,"builtin operation failure");
+  TRITON_CREATE_NVRTC_EXCEPTION(unknown_error              ,"unknown error");
+
+#undef TRITON_CREATE_NVRTC_EXCEPTION
+  }
+
+
+  namespace cuda
+  {
+  class base: public std::exception{};  // common base of every CUDA exception generated below
+
+#define TRITON_CREATE_CUDA_EXCEPTION(name, msg) class name: public base { public: const char * what() const noexcept override { return "CUDA: Error- " msg; } }
+
+  // One class per CUDA driver failure; what() returns "CUDA: Error- " followed by msg.
+  TRITON_CREATE_CUDA_EXCEPTION(invalid_value                   ,"invalid value");
+  TRITON_CREATE_CUDA_EXCEPTION(out_of_memory                   ,"out of memory");
+  TRITON_CREATE_CUDA_EXCEPTION(not_initialized                 ,"not initialized");
+  TRITON_CREATE_CUDA_EXCEPTION(deinitialized                   ,"deinitialized");
+  TRITON_CREATE_CUDA_EXCEPTION(profiler_disabled               ,"profiler disabled");
+  TRITON_CREATE_CUDA_EXCEPTION(profiler_not_initialized        ,"profiler not initialized");
+  TRITON_CREATE_CUDA_EXCEPTION(profiler_already_started        ,"profiler already started");
+  TRITON_CREATE_CUDA_EXCEPTION(profiler_already_stopped        ,"profiler already stopped");
+  TRITON_CREATE_CUDA_EXCEPTION(no_device                       ,"no device");
+  TRITON_CREATE_CUDA_EXCEPTION(invalid_device                  ,"invalid device");
+  TRITON_CREATE_CUDA_EXCEPTION(invalid_image                   ,"invalid image");
+  TRITON_CREATE_CUDA_EXCEPTION(invalid_context                 ,"invalid context");
+  TRITON_CREATE_CUDA_EXCEPTION(context_already_current         ,"context already current");
+  TRITON_CREATE_CUDA_EXCEPTION(map_failed                      ,"map failed");
+  TRITON_CREATE_CUDA_EXCEPTION(unmap_failed                    ,"unmap failed");
+  TRITON_CREATE_CUDA_EXCEPTION(array_is_mapped                 ,"array is mapped");
+  TRITON_CREATE_CUDA_EXCEPTION(already_mapped                  ,"already mapped");
+  TRITON_CREATE_CUDA_EXCEPTION(no_binary_for_gpu               ,"no binary for gpu");
+  TRITON_CREATE_CUDA_EXCEPTION(already_acquired                ,"already acquired");
+  TRITON_CREATE_CUDA_EXCEPTION(not_mapped                      ,"not mapped");
+  TRITON_CREATE_CUDA_EXCEPTION(not_mapped_as_array             ,"not mapped as array");
+  TRITON_CREATE_CUDA_EXCEPTION(not_mapped_as_pointer           ,"not mapped as pointer");
+  TRITON_CREATE_CUDA_EXCEPTION(ecc_uncorrectable               ,"ecc uncorrectable");
+  TRITON_CREATE_CUDA_EXCEPTION(unsupported_limit               ,"unsupported limit");
+  TRITON_CREATE_CUDA_EXCEPTION(context_already_in_use          ,"context already in use");
+  TRITON_CREATE_CUDA_EXCEPTION(peer_access_unsupported         ,"peer access unsupported");
+  TRITON_CREATE_CUDA_EXCEPTION(invalid_ptx                     ,"invalid ptx");
+  TRITON_CREATE_CUDA_EXCEPTION(invalid_graphics_context        ,"invalid graphics context");
+  TRITON_CREATE_CUDA_EXCEPTION(invalid_source                  ,"invalid source");
+  TRITON_CREATE_CUDA_EXCEPTION(file_not_found                  ,"file not found");
+  TRITON_CREATE_CUDA_EXCEPTION(shared_object_symbol_not_found  ,"shared object symbol not found");
+  TRITON_CREATE_CUDA_EXCEPTION(shared_object_init_failed       ,"shared object init failed");
+  TRITON_CREATE_CUDA_EXCEPTION(operating_system                ,"operating system");
+  TRITON_CREATE_CUDA_EXCEPTION(invalid_handle                  ,"invalid handle");
+  TRITON_CREATE_CUDA_EXCEPTION(not_found                       ,"not found");
+  TRITON_CREATE_CUDA_EXCEPTION(not_ready                       ,"not ready");
+  TRITON_CREATE_CUDA_EXCEPTION(illegal_address                 ,"illegal address");
+  TRITON_CREATE_CUDA_EXCEPTION(launch_out_of_resources         ,"launch out of resources");
+  TRITON_CREATE_CUDA_EXCEPTION(launch_timeout                  ,"launch timeout");
+  TRITON_CREATE_CUDA_EXCEPTION(launch_incompatible_texturing   ,"launch incompatible texturing");
+  TRITON_CREATE_CUDA_EXCEPTION(peer_access_already_enabled     ,"peer access already enabled");
+  TRITON_CREATE_CUDA_EXCEPTION(peer_access_not_enabled         ,"peer access not enabled");
+  TRITON_CREATE_CUDA_EXCEPTION(primary_context_active          ,"primary context active");
+  TRITON_CREATE_CUDA_EXCEPTION(context_is_destroyed            ,"context is destroyed");
+  TRITON_CREATE_CUDA_EXCEPTION(assert_error                    ,"assert");
+  TRITON_CREATE_CUDA_EXCEPTION(too_many_peers                  ,"too many peers");
+  TRITON_CREATE_CUDA_EXCEPTION(host_memory_already_registered  ,"host memory already registered");
+  TRITON_CREATE_CUDA_EXCEPTION(host_memory_not_registered      ,"host memory not registered");
+  TRITON_CREATE_CUDA_EXCEPTION(hardware_stack_error            ,"hardware stack error");
+  TRITON_CREATE_CUDA_EXCEPTION(illegal_instruction             ,"illegal instruction");
+  TRITON_CREATE_CUDA_EXCEPTION(misaligned_address              ,"misaligned address");
+  TRITON_CREATE_CUDA_EXCEPTION(invalid_address_space           ,"invalid address space");
+  TRITON_CREATE_CUDA_EXCEPTION(invalid_pc                      ,"invalid pc");
+  TRITON_CREATE_CUDA_EXCEPTION(launch_failed                   ,"launch failed");
+  TRITON_CREATE_CUDA_EXCEPTION(not_permitted                   ,"not permitted");
+  TRITON_CREATE_CUDA_EXCEPTION(not_supported                   ,"not supported");
+  TRITON_CREATE_CUDA_EXCEPTION(unknown                         ,"unknown");
+
+#undef TRITON_CREATE_CUDA_EXCEPTION
+  }
+
+  namespace cublas
+  {
+  class base: public std::exception{};  // common base of every cuBLAS exception generated below
+
+#define TRITON_CREATE_CUBLAS_EXCEPTION(name, msg) class name: public base { public: const char * what() const noexcept override { return "CUBLAS: Error- " msg; } }
+
+  TRITON_CREATE_CUBLAS_EXCEPTION(not_initialized              ,"not initialized");
+  TRITON_CREATE_CUBLAS_EXCEPTION(alloc_failed                 ,"alloc failed");
+  TRITON_CREATE_CUBLAS_EXCEPTION(invalid_value                ,"invalid value");
+  TRITON_CREATE_CUBLAS_EXCEPTION(arch_mismatch                ,"arch mismatch");
+  TRITON_CREATE_CUBLAS_EXCEPTION(mapping_error                ,"mapping error");
+  TRITON_CREATE_CUBLAS_EXCEPTION(execution_failed             ,"execution failed");
+  TRITON_CREATE_CUBLAS_EXCEPTION(internal_error               ,"internal error");
+  TRITON_CREATE_CUBLAS_EXCEPTION(not_supported                ,"not supported");
+  TRITON_CREATE_CUBLAS_EXCEPTION(license_error                ,"license error");
+  TRITON_CREATE_CUBLAS_EXCEPTION(unknown                      ,"unknown");
+
+#undef TRITON_CREATE_CUBLAS_EXCEPTION
+  }
+
+  namespace cudnn
+  {
+#define TRITON_CREATE_CUDNN_EXCEPTION(name, msg) class name: public std::exception { public: const char * what() const noexcept override { return "CUDNN: Error- " msg; } }
+  TRITON_CREATE_CUDNN_EXCEPTION(not_initialized              ,"not initialized");
+  TRITON_CREATE_CUDNN_EXCEPTION(alloc_failed                 ,"allocation failed");
+  TRITON_CREATE_CUDNN_EXCEPTION(bad_param                    ,"bad param");
+  TRITON_CREATE_CUDNN_EXCEPTION(internal_error               ,"internal error");
+  TRITON_CREATE_CUDNN_EXCEPTION(invalid_value                ,"invalid value");
+  TRITON_CREATE_CUDNN_EXCEPTION(arch_mismatch                ,"arch mismatch");
+  TRITON_CREATE_CUDNN_EXCEPTION(mapping_error                ,"mapping error");
+  TRITON_CREATE_CUDNN_EXCEPTION(execution_failed             ,"execution failed");
+  TRITON_CREATE_CUDNN_EXCEPTION(not_supported                ,"not supported");
+  TRITON_CREATE_CUDNN_EXCEPTION(license_error                ,"license error");
+  TRITON_CREATE_CUDNN_EXCEPTION(runtime_prerequisite_missing ,"prerequisite missing");
+  TRITON_CREATE_CUDNN_EXCEPTION(runtime_in_progress          ,"runtime in progress");
+  TRITON_CREATE_CUDNN_EXCEPTION(runtime_fp_overflow          ,"runtime fp overflow");
+#undef TRITON_CREATE_CUDNN_EXCEPTION  // stop the helper macro leaking into every file that includes this header
+  }
+
+  }
+  }
+}
+
+#endif
--- a/include/triton/driver/handle.h
+++ b/include/triton/driver/handle.h
@@ -0,0 +1,146 @@
+#pragma once
+
+#ifndef _TRITON_DRIVER_HANDLE_H_
+#define _TRITON_DRIVER_HANDLE_H_
+
+#include <memory>
+#include <map>
+#include <iostream>
+#include <functional>
+#include <type_traits>
+#include "triton/driver/dispatch.h"
+#include "llvm/ExecutionEngine/JITSymbol.h"
+#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
+#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
+#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
+#include "triton/tools/thread_pool.h"
+
+namespace llvm
+{
+class ExecutionEngine;
+class Function;
+}
+
+namespace triton
+{
+
+namespace driver
+{
+
+enum backend_t {  // tag recording which backend a polymorphic_resource was constructed for
+  CUDA,  // stored by the polymorphic_resource constructor that takes a CUType
+  Host  // stored by the polymorphic_resource constructor that takes a HostType
+};
+
+// Host handles
+struct host_platform_t{
+  // Empty tag type: the Host platform handle declares no state.
+};
+
+struct host_device_t{
+  // Empty tag type: the Host device handle declares no state.
+};
+
+struct host_context_t{
+  // Empty tag type: the Host context handle declares no state.
+};
+
+struct host_stream_t{  // state bundled into a Host-backend stream handle
+  std::shared_ptr<ThreadPool> pool;  // shared thread pool; NOTE(review): presumably runs enqueued kernels, confirm in the stream implementation
+  std::shared_ptr<std::vector<std::future<void>>> futures;  // shared list of void futures; presumably waited on by synchronize() (TODO confirm)
+  std::vector<std::shared_ptr<char*>> args;  // NOTE(review): looks like argument storage kept alive for pending work; confirm
+};
+
+struct host_module_t{  // LLVM/ORC objects bundled into a Host-backend module handle; every pointer is raw
+  std::string error;  // error text; NOTE(review): presumably filled during engine/JIT setup, confirm in the module implementation
+  llvm::ExecutionEngine* engine;
+  std::map<std::string, llvm::Function*> functions;  // LLVM functions keyed by string (presumably the function name; TODO confirm)
+  void(*fn)(char**, int32_t, int32_t, int32_t);  // raw entry point: takes a char** and three 32-bit integers (meaning not visible here)
+  llvm::orc::ExecutionSession* ES;  // ORC JIT pieces follow; NOTE(review): who frees these is not shown in this header
+  llvm::orc::RTDyldObjectLinkingLayer* ObjectLayer;
+  llvm::orc::IRCompileLayer* CompileLayer;
+  llvm::DataLayout* DL;
+  llvm::orc::MangleAndInterner* Mangle;
+  llvm::orc::ThreadSafeContext* Ctx;
+  llvm::orc::JITDylib *MainJD;
+};
+
+struct host_function_t{  // Host-backend counterpart of CUfunction in driver::kernel
+  llvm::Function* fn;  // raw pointer to the LLVM function; ownership is not visible in this header
+};
+
+struct host_buffer_t{  // Host-backend buffer handle
+  char* data;  // raw byte pointer; NOTE(review): allocation and release are not shown in this header
+};
+
+
+// Extra CUDA handles
+struct cu_event_t{  // pair of CUDA events treated as one handle
+  operator bool() const { return !(!first || !second); }  // usable only when both events are set
+  CUevent first;
+  CUevent second;
+};
+
+struct CUPlatform{  // records the outcome of driver initialisation
+  CUPlatform() : status_(dispatch::cuInit(0)) { }  // calls cuInit(0) through the dispatch layer and stores its result
+  operator bool() const { return status_ == CUDA_SUCCESS; }  // true on success; the raw CUresult is 0 on success, so converting it directly was inverted
+private:
+  CUresult status_;
+};
+
+template<class T, class CUType>
+class handle_interface{  // mix-in giving T a conversion to CUType plus comparisons on that value
+public:
+    //Accessors: casts this to T* and returns the value held by T::cu().h_
+    operator CUType() const { return *(((T*)this)->cu().h_); }
+    //Comparison: const-qualified so they can be called on const objects
+    bool operator==(handle_interface const & y) const { return (CUType)(*this) == (CUType)(y); }
+    bool operator!=(handle_interface const & y) const { return (CUType)(*this) != (CUType)(y); }
+    bool operator<(handle_interface const & y) const { return (CUType)(*this) < (CUType)(y); }
+};
+
+template<class T>
+class handle{  // wrapper around a shared_ptr<T> plus an ownership flag
+public:
+  template<class, class> friend class handle_interface;  // lets handle_interface read the protected h_
+public:
+  //Constructors (defined outside this header)
+  handle(T h, bool take_ownership = true);
+  handle();
+  ~handle();  // NOTE(review): presumably releases the resource only when has_ownership_ is set; confirm in the implementation
+  T& operator*() { return *h_; }
+  T const & operator*() const { return *h_; }
+  T* operator->() const { return h_.get(); }
+
+protected:
+  std::shared_ptr<T> h_;  // the wrapped value; dereferenced by operator* and operator->
+  bool has_ownership_;
+};
+
+template<class CUType, class HostType>
+class polymorphic_resource {  // holds one handle per backend plus a tag saying which one was supplied
+public:
+  polymorphic_resource(CUType cu, bool take_ownership): cu_(cu, take_ownership), backend_(CUDA){}  // CUDA-backed; hst_ is default-constructed
+  polymorphic_resource(HostType hst, bool take_ownership): hst_(hst, take_ownership), backend_(Host){}  // Host-backed; cu_ is default-constructed
+  virtual ~polymorphic_resource() { }
+
+  handle<CUType> cu() { return cu_; }  // non-const overloads return a copy of the handle
+  handle<HostType> hst() { return hst_; }
+  const handle<CUType>& cu() const { return cu_; }  // const overloads return a reference
+  const handle<HostType>& hst() const { return hst_; }
+  backend_t backend() const { return backend_; }  // const-qualified so the tag is readable through const references
+
+protected:
+  handle<CUType> cu_;
+  handle<HostType> hst_;
+  backend_t backend_;
+};
+
+}
+}
+
+#endif
--- a/include/triton/driver/kernel.h
+++ b/include/triton/driver/kernel.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#ifndef _TRITON_DRIVER_KERNEL_H_
+#define _TRITON_DRIVER_KERNEL_H_
+
+#include "triton/driver/module.h"
+#include "triton/driver/handle.h"
+#include <memory>
+
+namespace llvm
+{
+class GenericValue;
+}
+
+namespace triton
+{
+
+namespace driver
+{
+
+class cu_buffer;
+
+// Base
+class kernel: public polymorphic_resource<CUfunction, host_function_t> {  // kernel handle wrapping either a CUfunction or a host_function_t
+public:
+  kernel(driver::module* program, CUfunction fn, bool has_ownership);  // CUDA-backed kernel
+  kernel(driver::module* program, host_function_t fn, bool has_ownership);  // Host-backed kernel
+  driver::module* module();  // NOTE(review): presumably returns program_; the definition is not in this header
+  static kernel* create(driver::module* program, const char* name);  // factory returning a raw pointer; presumably picks host_kernel or cu_kernel (TODO confirm)
+private:
+  driver::module* program_;  // raw pointer to a module; NOTE(review): lifetime is not managed in this header
+};
+
+// Host
+class host_kernel: public kernel {  // Host-backend kernel
+public:
+  //Constructors: built from a module and a function name (lookup is done in the implementation)
+  host_kernel(driver::module* program, const char* name);
+};
+
+// CUDA
+class cu_kernel: public kernel {  // CUDA-backend kernel
+public:
+  //Constructors: built from a module and a function name (lookup is done in the implementation)
+  cu_kernel(driver::module* program, const char * name);
+};
+
+}
+
+}
+
+#endif
+
--- a/include/triton/driver/module.h
+++ b/include/triton/driver/module.h
@@ -0,0 +1,82 @@
+#pragma once
+
+#ifndef _TRITON_DRIVER_MODULE_H_
+#define _TRITON_DRIVER_MODULE_H_
+
+#include <map>
+#include "triton/driver/handle.h"
+#include "triton/driver/context.h"
+#include "triton/driver/buffer.h"
+
+namespace llvm
+{
+  class Module;
+  template<class T>
+  class SmallVectorImpl;
+}
+
+namespace triton
+{
+
+namespace driver
+{
+
+class cu_context;
+class cu_device;
+
+// Base
+class module: public polymorphic_resource<CUmodule, host_module_t> {  // abstract module wrapping either a CUmodule or a host_module_t
+protected:
+  void init_llvm();  // NOTE(review): presumably one-time LLVM target initialisation; confirm in the implementation
+
+  enum file_type_t{  // output kind requested from compile_llvm_module
+    Object,
+    Assembly
+  };
+
+public:
+  module(CUmodule mod, bool has_ownership);
+  module(host_module_t mod, bool has_ownership);
+  static module* create(driver::device* device, std::unique_ptr<llvm::Module> src);  // factory returning a raw pointer; takes ownership of src
+  void compile_llvm_module(std::unique_ptr<llvm::Module> module, const std::string& triple,
+                           const std::string &proc, std::string layout,
+                           llvm::SmallVectorImpl<char> &buffer,
+                           const std::string &features,
+                           file_type_t file_type);  // buffer is the only non-const reference, so presumably receives the output (TODO confirm)
+  virtual std::unique_ptr<buffer> symbol(const char * name) const = 0;  // implemented by host_module and cu_module
+  int spilled() const { return spilled_; }
+
+protected:
+  int spilled_;  // NOTE(review): no initialiser here; presumably set by a derived constructor, confirm
+};
+
+// CPU
+class host_module: public module{  // Host-backend module built from an LLVM module
+public:
+  host_module(std::unique_ptr<llvm::Module> module);  // takes ownership of the LLVM module
+  std::unique_ptr<buffer> symbol(const char * name) const override;  // implements module::symbol; override makes the compiler check the signature
+};
+
+// CUDA
+class cu_module: public module {  // CUDA-backend module; keeps the PTX and LLVM IR text it was built from
+  std::string compile_llvm_module(std::unique_ptr<llvm::Module> module, driver::device* device);  // private helper; hides the base-class overload of the same name
+  void init_from_ptx(const std::string& ptx);
+
+public:
+  cu_module(driver::device* device, std::unique_ptr<llvm::Module> module);  // build from an LLVM module
+  cu_module(driver::device* device, const std::string& source);  // build from source text (presumably PTX; TODO confirm)
+  std::unique_ptr<buffer> symbol(const char * name) const override;  // implements module::symbol; override makes the compiler check the signature
+  std::string llir() const { return llir_; }  // returns a copy
+  const std::string& ptx() const { return ptx_; }  // returns a reference to the stored text
+
+private:
+  std::string ptx_;
+  std::string llir_;
+};
+
+
+}
+
+}
+
+#endif
--- a/include/triton/driver/platform.h
+++ b/include/triton/driver/platform.h
@@ -0,0 +1,58 @@
+#pragma once
+
+#ifndef _TRITON_DRIVER_PLATFORM_H_
+#define _TRITON_DRIVER_PLATFORM_H_
+
+#include <vector>
+#include <string>
+
+#include "triton/driver/handle.h"
+
+namespace triton
+{
+
+namespace driver
+{
+
+class device;
+
+class platform {  // abstract named platform; concrete backends supply version() and devices()
+public:
+  // Constructor / destructor
+  platform(const std::string& name): name_(name){ }
+  virtual ~platform() { }  // virtual so deleting a derived platform through platform* runs the derived destructor
+  // Accessors
+  std::string name() const { return name_; }
+  // Virtual methods
+  virtual std::string version() const = 0;
+  virtual void devices(std::vector<driver::device *> &devices) const = 0;  // results are delivered through the output vector
+private:
+  std::string name_;
+};
+
+// CUDA
+class cu_platform: public platform
+{
+public:
+  cu_platform(): platform("CUDA") { }  // cu_ is default-constructed here
+  std::string version() const override;
+  void devices(std::vector<driver::device*> &devices) const override;
+
+private:
+  handle<CUPlatform> cu_;  // platform handle; CUPlatform's constructor calls cuInit(0)
+};
+
+// Host
+class host_platform: public platform
+{
+public:
+  host_platform(): platform("CPU") { }  // reported platform name is "CPU"
+  std::string version() const override;
+  void devices(std::vector<driver::device*> &devices) const override;
+};
+
+}
+
+}
+
+#endif
--- a/include/triton/driver/stream.h
+++ b/include/triton/driver/stream.h
@@ -0,0 +1,68 @@
+#pragma once
+
+#ifndef _TRITON_DRIVER_STREAM_H_
+#define _TRITON_DRIVER_STREAM_H_
+
+#include <map>
+#include "triton/driver/context.h"
+#include "triton/driver/device.h"
+#include "triton/driver/handle.h"
+#include "triton/driver/buffer.h"
+
+namespace triton
+{
+
+namespace driver
+{
+
+class kernel;
+class event;
+class Range;
+class cu_buffer;
+
+// Base
+class stream: public polymorphic_resource<CUstream, host_stream_t> {  // abstract stream wrapping either a CUstream or a host_stream_t
+public:
+  stream(CUstream, bool has_ownership);
+  stream(host_stream_t, bool has_ownership);
+  // factory: returns a raw pointer for the requested backend
+  static driver::stream* create(backend_t backend);
+  // methods (pure virtual; host_stream and cu_stream declare the implementations); only enqueue's shared_mem has a default
+  virtual void synchronize() = 0;
+  virtual void enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t shared_mem = 0) = 0;
+  virtual void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr) = 0;
+  virtual void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr) = 0;
+  // template helpers: forward a std::vector as (byte size, data pointer) to the raw overloads; the vector must already be sized
+  template<class T> void write(driver::buffer* buf, bool blocking, std::size_t offset, std::vector<T> const & x)
+  { write(buf, blocking, offset, x.size()*sizeof(T), x.data()); }
+  template<class T> void read(driver::buffer* buf, bool blocking, std::size_t offset, std::vector<T>& x)
+  { read(buf, blocking, offset, x.size()*sizeof(T), x.data()); }
+};
+
+// Host
+class host_stream: public stream {  // Host-backend stream; override makes the compiler check each signature against driver::stream
+public:
+  host_stream();
+  void synchronize() override;
+  void enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t shared_mem) override;
+  void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr) override;
+  void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr) override;
+};
+
+// CUDA
+class cu_stream: public stream {  // CUDA-backend stream; override makes the compiler check each signature against driver::stream
+public:
+  cu_stream(CUstream str, bool take_ownership);  // wrap an existing CUstream
+  cu_stream();
+  void synchronize() override;
+  void enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t shared_mem) override;
+  void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr) override;
+  void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr) override;
+};
+
+
+}
+
+}
+
+#endif
--- a/include/triton/external/CL/cl.h
+++ b/include/triton/external/CL/cl.h
--- a/include/triton/external/CL/cl.hpp
+++ b/include/triton/external/CL/cl.hpp
--- a/include/triton/external/CL/cl2.hpp
+++ b/include/triton/external/CL/cl2.hpp
--- a/include/triton/external/CL/cl_d3d10.h
+++ b/include/triton/external/CL/cl_d3d10.h
@@ -0,0 +1,131 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_CL_D3D10_H
+#define __OPENCL_CL_D3D10_H
+
+#include <d3d10.h>
+#include "cl.h"
+#include "cl_platform.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+ * cl_khr_d3d10_sharing                                                       */
+#define cl_khr_d3d10_sharing 1
+
+typedef cl_uint cl_d3d10_device_source_khr;
+typedef cl_uint cl_d3d10_device_set_khr;
+
+/******************************************************************************/
+
+/* Error Codes */
+#define CL_INVALID_D3D10_DEVICE_KHR                  -1002
+#define CL_INVALID_D3D10_RESOURCE_KHR                -1003
+#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR       -1004
+#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR           -1005
+
+/* cl_d3d10_device_source_khr */
+#define CL_D3D10_DEVICE_KHR                          0x4010
+#define CL_D3D10_DXGI_ADAPTER_KHR                    0x4011
+
+/* cl_d3d10_device_set_khr */
+#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR           0x4012
+#define CL_ALL_DEVICES_FOR_D3D10_KHR                 0x4013
+
+/* cl_context_info */
+#define CL_CONTEXT_D3D10_DEVICE_KHR                  0x4014
+#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
+
+/* cl_mem_info */
+#define CL_MEM_D3D10_RESOURCE_KHR                    0x4015
+
+/* cl_image_info */
+#define CL_IMAGE_D3D10_SUBRESOURCE_KHR               0x4016
+
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR         0x4017
+#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR         0x4018
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
+    cl_platform_id             platform,
+    cl_d3d10_device_source_khr d3d_device_source,
+    void *                     d3d_object,
+    cl_d3d10_device_set_khr    d3d_device_set,
+    cl_uint                    num_entries,
+    cl_device_id *             devices,
+    cl_uint *                  num_devices) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
+    cl_context     context,
+    cl_mem_flags   flags,
+    ID3D10Buffer * resource,
+    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D10Texture2D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D10Texture3D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_CL_D3D10_H */
+
--- a/include/triton/external/CL/cl_d3d11.h
+++ b/include/triton/external/CL/cl_d3d11.h
@@ -0,0 +1,131 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_CL_D3D11_H
+#define __OPENCL_CL_D3D11_H
+
+#include <d3d11.h>
+#include "cl.h"
+#include "cl_platform.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+ * cl_khr_d3d11_sharing                                                       */
+#define cl_khr_d3d11_sharing 1
+
+typedef cl_uint cl_d3d11_device_source_khr;
+typedef cl_uint cl_d3d11_device_set_khr;
+
+/******************************************************************************/
+
+/* Error Codes */
+#define CL_INVALID_D3D11_DEVICE_KHR                  -1006
+#define CL_INVALID_D3D11_RESOURCE_KHR                -1007
+#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR       -1008
+#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR           -1009
+
+/* cl_d3d11_device_source */
+#define CL_D3D11_DEVICE_KHR                          0x4019
+#define CL_D3D11_DXGI_ADAPTER_KHR                    0x401A
+
+/* cl_d3d11_device_set */
+#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR           0x401B
+#define CL_ALL_DEVICES_FOR_D3D11_KHR                 0x401C
+
+/* cl_context_info */
+#define CL_CONTEXT_D3D11_DEVICE_KHR                  0x401D
+#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
+
+/* cl_mem_info */
+#define CL_MEM_D3D11_RESOURCE_KHR                    0x401E
+
+/* cl_image_info */
+#define CL_IMAGE_D3D11_SUBRESOURCE_KHR               0x401F
+
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR         0x4020
+#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR         0x4021
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
+    cl_platform_id             platform,
+    cl_d3d11_device_source_khr d3d_device_source,
+    void *                     d3d_object,
+    cl_d3d11_device_set_khr    d3d_device_set,
+    cl_uint                    num_entries,
+    cl_device_id *             devices,
+    cl_uint *                  num_devices) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
+    cl_context     context,
+    cl_mem_flags   flags,
+    ID3D11Buffer * resource,
+    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D11Texture2D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D11Texture3D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_CL_D3D11_H */
+
--- a/include/triton/external/CL/cl_dx9_media_sharing.h
+++ b/include/triton/external/CL/cl_dx9_media_sharing.h
@@ -0,0 +1,132 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H
+#define __OPENCL_CL_DX9_MEDIA_SHARING_H
+
+#include "cl.h"
+#include "cl_platform.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************/
+/* cl_khr_dx9_media_sharing                                                   */
+#define cl_khr_dx9_media_sharing 1
+
+typedef cl_uint             cl_dx9_media_adapter_type_khr;
+typedef cl_uint             cl_dx9_media_adapter_set_khr;
+    
+#if defined(_WIN32)
+#include <d3d9.h>
+typedef struct _cl_dx9_surface_info_khr
+{
+    IDirect3DSurface9 *resource;
+    HANDLE shared_handle;
+} cl_dx9_surface_info_khr;
+#endif
+
+
+/******************************************************************************/
+
+/* Error Codes */
+#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR                -1010
+#define CL_INVALID_DX9_MEDIA_SURFACE_KHR                -1011
+#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR       -1012
+#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR           -1013
+
+/* cl_dx9_media_adapter_type_khr */
+#define CL_ADAPTER_D3D9_KHR                              0x2020
+#define CL_ADAPTER_D3D9EX_KHR                            0x2021
+#define CL_ADAPTER_DXVA_KHR                              0x2022
+
+/* cl_dx9_media_adapter_set_khr */
+#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR   0x2023
+#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR         0x2024
+
+/* cl_context_info */
+#define CL_CONTEXT_ADAPTER_D3D9_KHR                      0x2025
+#define CL_CONTEXT_ADAPTER_D3D9EX_KHR                    0x2026
+#define CL_CONTEXT_ADAPTER_DXVA_KHR                      0x2027
+
+/* cl_mem_info */
+#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR                0x2028
+#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR                0x2029
+
+/* cl_image_info */
+#define CL_IMAGE_DX9_MEDIA_PLANE_KHR                     0x202A
+
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR        0x202B
+#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR        0x202C
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
+    cl_platform_id                   platform,
+    cl_uint                          num_media_adapters,
+    cl_dx9_media_adapter_type_khr *  media_adapter_type,
+    void *                           media_adapters,
+    cl_dx9_media_adapter_set_khr     media_adapter_set,
+    cl_uint                          num_entries,
+    cl_device_id *                   devices,
+    cl_uint *                        num_devices) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
+    cl_context                    context,
+    cl_mem_flags                  flags,
+    cl_dx9_media_adapter_type_khr adapter_type,
+    void *                        surface_info,
+    cl_uint                       plane,                                                                          
+    cl_int *                      errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_CL_DX9_MEDIA_SHARING_H */
+
--- a/include/triton/external/CL/cl_dx9_media_sharing_intel.h
+++ b/include/triton/external/CL/cl_dx9_media_sharing_intel.h
@@ -0,0 +1,182 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2016 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+/*****************************************************************************\
+
+Copyright (c) 2013-2016 Intel Corporation All Rights Reserved.
+
+THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
+MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+File Name: cl_dx9_media_sharing_intel.h
+
+Abstract:
+
+Notes:
+
+\*****************************************************************************/
+
+#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
+#define __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
+
+#include <CL/cl.h>  /* NOTE(review): angle-bracket CL/ form; the sibling headers in this directory include "cl.h" with quotes -- confirm this resolves to the vendored copy and not a system-wide one */
+#include <CL/cl_platform.h>
+#include <d3d9.h>  /* NOTE(review): Direct3D includes are not wrapped in #if defined(_WIN32) here, unlike cl_dx9_media_sharing.h -- presumably this header is only meant for Windows builds; confirm */
+#include <dxvahd.h>
+#include <wtypes.h>
+#include <d3d9types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/***************************************
+* cl_intel_dx9_media_sharing extension *
+****************************************/
+
+#define cl_intel_dx9_media_sharing 1
+
+typedef cl_uint cl_dx9_device_source_intel;
+typedef cl_uint cl_dx9_device_set_intel;
+
+/* error codes */
+#define CL_INVALID_DX9_DEVICE_INTEL                   -1010
+#define CL_INVALID_DX9_RESOURCE_INTEL                 -1011
+#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL        -1012
+#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL            -1013
+
+/* cl_dx9_device_source_intel */
+#define CL_D3D9_DEVICE_INTEL                          0x4022
+#define CL_D3D9EX_DEVICE_INTEL                        0x4070
+#define CL_DXVA_DEVICE_INTEL                          0x4071
+
+/* cl_dx9_device_set_intel */
+#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL            0x4024
+#define CL_ALL_DEVICES_FOR_DX9_INTEL                  0x4025
+
+/* cl_context_info */
+#define CL_CONTEXT_D3D9_DEVICE_INTEL                  0x4026
+#define CL_CONTEXT_D3D9EX_DEVICE_INTEL                0x4072
+#define CL_CONTEXT_DXVA_DEVICE_INTEL                  0x4073
+
+/* cl_mem_info */
+#define CL_MEM_DX9_RESOURCE_INTEL                     0x4027
+#define CL_MEM_DX9_SHARED_HANDLE_INTEL                0x4074
+
+/* cl_image_info */
+#define CL_IMAGE_DX9_PLANE_INTEL                      0x4075
+
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL          0x402A
+#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL          0x402B
+/******************************************************************************/
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceIDsFromDX9INTEL(
+    cl_platform_id              /* platform */,
+    cl_dx9_device_source_intel  /* dx9_device_source */,
+    void*                       /* dx9_object */,
+    cl_dx9_device_set_intel     /* dx9_device_set */,
+    cl_uint                     /* num_entries */, 
+    cl_device_id*               /* devices */, 
+    cl_uint*                    /* num_devices */) CL_EXT_SUFFIX__VERSION_1_1;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)(
+    cl_platform_id              /* platform */,
+    cl_dx9_device_source_intel  /* dx9_device_source */,
+    void*                       /* dx9_object */,
+    cl_dx9_device_set_intel     /* dx9_device_set */,
+    cl_uint                     /* num_entries */, 
+    cl_device_id*               /* devices */, 
+    cl_uint*                    /* num_devices */) CL_EXT_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromDX9MediaSurfaceINTEL(
+    cl_context                  /* context */,
+    cl_mem_flags                /* flags */,
+    IDirect3DSurface9*          /* resource */,
+    HANDLE                      /* sharedHandle */,
+    UINT                        /* plane */,
+    cl_int*                     /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)(
+    cl_context                  /* context */,
+    cl_mem_flags                /* flags */,
+    IDirect3DSurface9*          /* resource */,
+    HANDLE                      /* sharedHandle */,
+    UINT                        /* plane */,
+    cl_int*                     /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireDX9ObjectsINTEL(
+    cl_command_queue            /* command_queue */,
+    cl_uint                     /* num_objects */,
+    const cl_mem*               /* mem_objects */,
+    cl_uint                     /* num_events_in_wait_list */,
+    const cl_event*             /* event_wait_list */,
+    cl_event*                   /* event */) CL_EXT_SUFFIX__VERSION_1_1;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)(
+    cl_command_queue            /* command_queue */,
+    cl_uint                     /* num_objects */,
+    const cl_mem*               /* mem_objects */,
+    cl_uint                     /* num_events_in_wait_list */,
+    const cl_event*             /* event_wait_list */,
+    cl_event*                   /* event */) CL_EXT_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseDX9ObjectsINTEL(
+    cl_command_queue            /* command_queue */,
+    cl_uint                     /* num_objects */,
+    cl_mem*                     /* mem_objects -- NOTE(review): not const-qualified, unlike mem_objects in clEnqueueAcquireDX9ObjectsINTEL */,
+    cl_uint                     /* num_events_in_wait_list */,
+    const cl_event*             /* event_wait_list */,
+    cl_event*                   /* event */) CL_EXT_SUFFIX__VERSION_1_1;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)(
+    cl_command_queue            /* command_queue */,
+    cl_uint                     /* num_objects */,
+    cl_mem*                     /* mem_objects -- NOTE(review): not const-qualified, unlike mem_objects in clEnqueueAcquireDX9ObjectsINTEL_fn */,
+    cl_uint                     /* num_events_in_wait_list */,
+    const cl_event*             /* event_wait_list */,
+    cl_event*                   /* event */) CL_EXT_SUFFIX__VERSION_1_1;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H */
+
--- a/include/triton/external/CL/cl_egl.h
+++ b/include/triton/external/CL/cl_egl.h
@@ -0,0 +1,136 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+#ifndef __OPENCL_CL_EGL_H
+#define __OPENCL_CL_EGL_H
+
+#ifdef __APPLE__  /* NOTE(review): empty branch -- on Apple this header includes no cl.h, so the cl_* types used below must already be declared by the including file; confirm this is intended */
+
+#else
+#include "cl.h"
+#endif  /* !__APPLE__ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
+#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR  0x202F
+#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR    0x202D
+#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR    0x202E
+
+/* Error type for clCreateFromEGLImageKHR */
+#define CL_INVALID_EGL_OBJECT_KHR             -1093
+#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR      -1092
+
+/* CLeglImageKHR is an opaque handle to an EGLImage */
+typedef void* CLeglImageKHR;
+
+/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
+typedef void* CLeglDisplayKHR;
+
+/* CLeglSyncKHR is an opaque handle to an EGLSync object */
+typedef void* CLeglSyncKHR;
+
+/* properties passed to clCreateFromEGLImageKHR */
+typedef intptr_t cl_egl_image_properties_khr;
+
+
+#define cl_khr_egl_image 1
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromEGLImageKHR(cl_context                  /* context */,
+                        CLeglDisplayKHR             /* egldisplay */,
+                        CLeglImageKHR               /* eglimage */,
+                        cl_mem_flags                /* flags */,
+                        const cl_egl_image_properties_khr * /* properties */,
+                        cl_int *                    /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
+	cl_context                  context,
+	CLeglDisplayKHR             egldisplay,
+	CLeglImageKHR               eglimage,
+	cl_mem_flags                flags,
+	const cl_egl_image_properties_khr * properties,
+	cl_int *                    errcode_ret);
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireEGLObjectsKHR(cl_command_queue /* command_queue */,
+                              cl_uint          /* num_objects */,
+                              const cl_mem *   /* mem_objects */,
+                              cl_uint          /* num_events_in_wait_list */,
+                              const cl_event * /* event_wait_list */,
+                              cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
+	cl_command_queue command_queue,
+	cl_uint          num_objects,
+	const cl_mem *   mem_objects,
+	cl_uint          num_events_in_wait_list,
+	const cl_event * event_wait_list,
+	cl_event *       event);
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseEGLObjectsKHR(cl_command_queue /* command_queue */,
+                              cl_uint          /* num_objects */,
+                              const cl_mem *   /* mem_objects */,
+                              cl_uint          /* num_events_in_wait_list */,
+                              const cl_event * /* event_wait_list */,
+                              cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
+	cl_command_queue command_queue,
+	cl_uint          num_objects,
+	const cl_mem *   mem_objects,
+	cl_uint          num_events_in_wait_list,
+	const cl_event * event_wait_list,
+	cl_event *       event);
+
+
+#define cl_khr_egl_event 1
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromEGLSyncKHR(cl_context      /* context */,
+                            CLeglSyncKHR    /* sync */,
+                            CLeglDisplayKHR /* display */,
+                            cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
+	cl_context      context,
+	CLeglSyncKHR    sync,
+	CLeglDisplayKHR display,
+	cl_int *        errcode_ret);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_EGL_H */
--- a/include/triton/external/CL/cl_ext.h
+++ b/include/triton/external/CL/cl_ext.h
@@ -0,0 +1,670 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */
+
+/* cl_ext.h contains OpenCL extensions which don't have external */
+/* (OpenGL, D3D) dependencies.                                   */
+
+#ifndef __CL_EXT_H
+#define __CL_EXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+        #include <OpenCL/cl.h>
+    #include <AvailabilityMacros.h>
+#else
+        #include "cl.h"
+#endif
+
+/* cl_khr_fp64 extension - no extension #define since it has no functions  */
+#define CL_DEVICE_DOUBLE_FP_CONFIG                  0x1032
+
+/* cl_khr_fp16 extension - no extension #define since it has no functions  */
+#define CL_DEVICE_HALF_FP_CONFIG                    0x1033
+
+/* Memory object destruction
+ *
+ * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR
+ *
+ * Registers a user callback function that will be called when the memory object is deleted and its resources 
+ * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback 
+ * stack associated with memobj. The registered user callback functions are called in the reverse order in 
+ * which they were registered. The user callback functions are called and then the memory object is deleted 
+ * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be 
+ * notified when the memory referenced by host_ptr, specified when the memory object is created and used as 
+ * the storage bits for the memory object, can be reused or freed.
+ *
+ * The application may not call CL APIs with the cl_mem object passed to pfn_notify.
+ *
+ * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ */
+#define cl_APPLE_SetMemObjectDestructor 1
+cl_int  CL_API_ENTRY clSetMemObjectDestructorAPPLE(  cl_mem /* memobj */, 
+                                        void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), 
+                                        void * /*user_data */ )             CL_EXT_SUFFIX__VERSION_1_0;  
+
+
+/* Context Logging Functions
+ *
+ * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
+ * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ *
+ * clLogMessagesToSystemLogAPPLE forwards all log messages to the Apple System Logger.
+ */
+#define cl_APPLE_ContextLoggingFunctions 1
+extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE(  const char * /* errstr */, 
+                                            const void * /* private_info */, 
+                                            size_t       /* cb */, 
+                                            void *       /* user_data */ )  CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStdoutAPPLE sends all log messages to the file descriptor stdout */
+extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE(   const char * /* errstr */, 
+                                          const void * /* private_info */, 
+                                          size_t       /* cb */, 
+                                          void *       /* user_data */ )    CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStderrAPPLE sends all log messages to the file descriptor stderr */
+extern void CL_API_ENTRY clLogMessagesToStderrAPPLE(   const char * /* errstr */, 
+                                          const void * /* private_info */, 
+                                          size_t       /* cb */, 
+                                          void *       /* user_data */ )    CL_EXT_SUFFIX__VERSION_1_0;
+
+
+/************************ 
+* cl_khr_icd extension *                                                  
+************************/
+#define cl_khr_icd 1
+
+/* cl_platform_info                                                        */
+#define CL_PLATFORM_ICD_SUFFIX_KHR                  0x0920
+
+/* Additional Error Codes                                                  */
+#define CL_PLATFORM_NOT_FOUND_KHR                   -1001
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clIcdGetPlatformIDsKHR(cl_uint          /* num_entries */,
+                       cl_platform_id * /* platforms */,
+                       cl_uint *        /* num_platforms */);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
+    cl_uint          /* num_entries */,
+    cl_platform_id * /* platforms */,
+    cl_uint *        /* num_platforms */);
+
+
+/* Extension: cl_khr_image2D_buffer
+ *
+ * This extension allows a 2D image to be created from a cl_mem buffer without a copy.
+ * The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t.
+ * Both the sampler and sampler-less read_image built-in functions are supported for 2D images
+ * and 2D images created from a buffer.  Similarly, the write_image built-ins are also supported
+ * for 2D images created from a buffer.
+ *
+ * When the 2D image from buffer is created, the client must specify the width,
+ * height, image format (i.e. channel order and channel data type) and optionally the row pitch
+ *
+ * The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels.
+ * The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels.
+ */
+    
+/**************************************
+ * cl_khr_initialize_memory extension *
+ **************************************/
+    
+#define CL_CONTEXT_MEMORY_INITIALIZE_KHR            0x2030
+    
+    
+/**************************************
+ * cl_khr_terminate_context extension *
+ **************************************/
+    
+#define CL_DEVICE_TERMINATE_CAPABILITY_KHR          0x2031
+#define CL_CONTEXT_TERMINATE_KHR                    0x2032
+
+#define cl_khr_terminate_context 1
+extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+    
+    
+/*
+ * Extension: cl_khr_spir
+ *
+ * This extension adds support to create an OpenCL program object from a 
+ * Standard Portable Intermediate Representation (SPIR) instance
+ */
+
+#define CL_DEVICE_SPIR_VERSIONS                     0x40E0
+#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE         0x40E1
+
+
+/*****************************************
+ * cl_khr_create_command_queue extension *
+ *****************************************/
+#define cl_khr_create_command_queue 1
+
+typedef cl_bitfield cl_queue_properties_khr;
+
+extern CL_API_ENTRY cl_command_queue CL_API_CALL
+clCreateCommandQueueWithPropertiesKHR( cl_context /* context */,
+                                       cl_device_id /* device */,
+                                       const cl_queue_properties_khr* /* properties */,
+                                       cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+typedef CL_API_ENTRY cl_command_queue
+(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)( cl_context /* context */,
+                                                         cl_device_id /* device */,
+                                                         const cl_queue_properties_khr* /* properties */,
+                                                         cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+
+/******************************************
+* cl_nv_device_attribute_query extension *
+******************************************/
+/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
+#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV       0x4000
+#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV       0x4001
+#define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002
+#define CL_DEVICE_WARP_SIZE_NV                      0x4003
+#define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
+#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
+#define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
+
+/*********************************
+* cl_amd_device_memory_flags *
+*********************************/
+#define cl_amd_device_memory_flags 1
+
+#define CL_MEM_USE_PERSISTENT_MEM_AMD       (1 << 6)        /* Alloc from GPU's CPU visible heap */
+
+/* cl_device_info */
+#define CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT           0x4032
+
+/*********************************
+* cl_amd_device_attribute_query *
+*********************************/
+#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD        0x4036
+#define CL_DEVICE_TOPOLOGY_AMD                      0x4037
+#define CL_DEVICE_BOARD_NAME_AMD                    0x4038
+#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD            0x4039
+#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD         0x4040
+#define CL_DEVICE_SIMD_WIDTH_AMD                    0x4041
+#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD        0x4042
+#define CL_DEVICE_WAVEFRONT_WIDTH_AMD               0x4043
+#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD           0x4044
+#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD      0x4045
+#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046
+#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD   0x4047
+#define CL_DEVICE_LOCAL_MEM_BANKS_AMD               0x4048
+
+typedef union  /* one topology record, readable through either of the two member layouts below */
+{
+    struct { cl_uint type; cl_uint data[5]; } raw;  /* untyped view: type tag followed by five cl_uint words */
+    struct { cl_uint type; cl_char unused[17]; cl_char bus; cl_char device; cl_char function; } pcie;  /* PCIe view: type tag, 17 unused bytes, then bus, device and function (one cl_char each) */
+} cl_device_topology_amd;
+
+#define CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD            1  /* presumably the value of the type field for which the pcie view applies -- TODO confirm against the AMD extension spec */
+
+
+/**************************
+* cl_amd_offline_devices *
+**************************/
+#define CL_CONTEXT_OFFLINE_DEVICES_AMD              0x403F
+
+/*********************************
+* cl_arm_printf extension
+*********************************/
+#define CL_PRINTF_CALLBACK_ARM                      0x40B0
+#define CL_PRINTF_BUFFERSIZE_ARM                    0x40B1
+
+#ifdef CL_VERSION_1_1
+   /***********************************
+    * cl_ext_device_fission extension *
+    ***********************************/
+    #define cl_ext_device_fission   1
+    
+    extern CL_API_ENTRY cl_int CL_API_CALL
+    clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; 
+    
+    typedef CL_API_ENTRY cl_int 
+    (CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    extern CL_API_ENTRY cl_int CL_API_CALL
+    clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; 
+    
+    typedef CL_API_ENTRY cl_int 
+    (CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    typedef cl_ulong  cl_device_partition_property_ext;
+    extern CL_API_ENTRY cl_int CL_API_CALL
+    clCreateSubDevicesEXT(  cl_device_id /*in_device*/,
+                            const cl_device_partition_property_ext * /* properties */,
+                            cl_uint /*num_entries*/,
+                            cl_device_id * /*out_devices*/,
+                            cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    typedef CL_API_ENTRY cl_int 
+    ( CL_API_CALL * clCreateSubDevicesEXT_fn)(  cl_device_id /*in_device*/,
+                                                const cl_device_partition_property_ext * /* properties */,
+                                                cl_uint /*num_entries*/,
+                                                cl_device_id * /*out_devices*/,
+                                                cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    /* cl_device_partition_property_ext */
+    #define CL_DEVICE_PARTITION_EQUALLY_EXT             0x4050
+    #define CL_DEVICE_PARTITION_BY_COUNTS_EXT           0x4051
+    #define CL_DEVICE_PARTITION_BY_NAMES_EXT            0x4052
+    #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT  0x4053
+    
+    /* clGetDeviceInfo selectors */
+    #define CL_DEVICE_PARENT_DEVICE_EXT                 0x4054
+    #define CL_DEVICE_PARTITION_TYPES_EXT               0x4055
+    #define CL_DEVICE_AFFINITY_DOMAINS_EXT              0x4056
+    #define CL_DEVICE_REFERENCE_COUNT_EXT               0x4057
+    #define CL_DEVICE_PARTITION_STYLE_EXT               0x4058
+    
+    /* error codes */
+    #define CL_DEVICE_PARTITION_FAILED_EXT              -1057
+    #define CL_INVALID_PARTITION_COUNT_EXT              -1058
+    #define CL_INVALID_PARTITION_NAME_EXT               -1059
+    
+    /* CL_AFFINITY_DOMAINs */
+    #define CL_AFFINITY_DOMAIN_L1_CACHE_EXT             0x1
+    #define CL_AFFINITY_DOMAIN_L2_CACHE_EXT             0x2
+    #define CL_AFFINITY_DOMAIN_L3_CACHE_EXT             0x3
+    #define CL_AFFINITY_DOMAIN_L4_CACHE_EXT             0x4
+    #define CL_AFFINITY_DOMAIN_NUMA_EXT                 0x10
+    #define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT     0x100
+    
+    /* cl_device_partition_property_ext list terminators */
+    #define CL_PROPERTIES_LIST_END_EXT                  ((cl_device_partition_property_ext) 0)
+    #define CL_PARTITION_BY_COUNTS_LIST_END_EXT         ((cl_device_partition_property_ext) 0)
+    #define CL_PARTITION_BY_NAMES_LIST_END_EXT          ((cl_device_partition_property_ext) 0 - 1)
+
+    /* cl_ext_atomic_counters_32 and cl_ext_atomic_counters_64 extensions
+     * no extension #define since they have no functions
+     */
+    #define CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT           0x4032
+
+/*********************************
+* cl_qcom_ext_host_ptr extension
+*********************************/
+
+#define CL_MEM_EXT_HOST_PTR_QCOM                  (1 << 29)
+
+#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM   0x40A0      
+#define CL_DEVICE_PAGE_SIZE_QCOM                  0x40A1
+#define CL_IMAGE_ROW_ALIGNMENT_QCOM               0x40A2
+#define CL_IMAGE_SLICE_ALIGNMENT_QCOM             0x40A3
+#define CL_MEM_HOST_UNCACHED_QCOM                 0x40A4
+#define CL_MEM_HOST_WRITEBACK_QCOM                0x40A5
+#define CL_MEM_HOST_WRITETHROUGH_QCOM             0x40A6
+#define CL_MEM_HOST_WRITE_COMBINING_QCOM          0x40A7
+
+typedef cl_uint                                   cl_image_pitch_info_qcom;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceImageInfoQCOM(cl_device_id             device,
+                         size_t                   image_width,
+                         size_t                   image_height,
+                         const cl_image_format   *image_format,
+                         cl_image_pitch_info_qcom param_name,
+                         size_t                   param_value_size,
+                         void                    *param_value,
+                         size_t                  *param_value_size_ret);
+
+typedef struct _cl_mem_ext_host_ptr
+{
+    /* Type of external memory allocation. */
+    /* Legal values will be defined in layered extensions. */
+    cl_uint  allocation_type;
+            
+    /* Host cache policy for this external memory allocation. */
+    cl_uint  host_cache_policy;
+
+} cl_mem_ext_host_ptr;
+
+/*********************************
+* cl_qcom_ion_host_ptr extension
+*********************************/
+
+#define CL_MEM_ION_HOST_PTR_QCOM                  0x40A8
+
+typedef struct _cl_mem_ion_host_ptr
+{
+    /* Type of external memory allocation. */
+    /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */
+    cl_mem_ext_host_ptr  ext_host_ptr;
+
+    /* ION file descriptor */
+    int                  ion_filedesc;
+            
+    /* Host pointer to the ION allocated memory */
+    void*                ion_hostptr;
+
+} cl_mem_ion_host_ptr;
+
+#endif /* CL_VERSION_1_1 */
+
+#if defined(CL_VERSION_1_2)
+
+/******************************************
+ * cl_img_yuv_image extension *
+ ******************************************/
+
+/* Image formats used in clCreateImage */
+#define CL_NV21_IMG                                 0x40D0
+#define CL_YV12_IMG                                 0x40D1
+
+/******************************************
+ * cl_img_cached_allocations extension *
+ ******************************************/
+
+/* Flag values used by clCreateBuffer */
+#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG         	(1 << 26)
+#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG           	(1 << 27)
+
+/******************************************
+ * cl_img_use_gralloc_ptr extension *
+ ******************************************/
+
+/* Flag values used by clCreateBuffer */
+#define CL_MEM_USE_GRALLOC_PTR_IMG                 	(1 << 28)
+
+/* To be used by clGetEventInfo: */
+#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG      0x40D2
+#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG      0x40D3
+
+/* Error code from clEnqueueReleaseGrallocObjectsIMG */
+#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG        0x40D4
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireGrallocObjectsIMG(cl_command_queue      /* command_queue */,
+                                  cl_uint               /* num_objects */,
+                                  const cl_mem *        /* mem_objects */,
+                                  cl_uint               /* num_events_in_wait_list */,
+                                  const cl_event *      /* event_wait_list */,
+                                  cl_event *            /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseGrallocObjectsIMG(cl_command_queue      /* command_queue */,
+                                  cl_uint               /* num_objects */,
+                                  const cl_mem *        /* mem_objects */,
+                                  cl_uint               /* num_events_in_wait_list */,
+                                  const cl_event *      /* event_wait_list */,
+                                  cl_event *            /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+#endif /* CL_VERSION_1_2 */
+
+#ifdef CL_VERSION_2_0
+/*********************************
+* cl_khr_subgroups extension
+*********************************/
+#define cl_khr_subgroups 1
+
+/* cl_kernel_sub_group_info is declared in cl.h. */
+
+/* cl_kernel_sub_group_info */
+#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR	0x2033
+#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR		0x2034
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelSubGroupInfoKHR(cl_kernel /* in_kernel */,
+						   cl_device_id /*in_device*/,
+						   cl_kernel_sub_group_info /* param_name */,
+						   size_t /*input_value_size*/,
+						   const void * /*input_value*/,
+						   size_t /*param_value_size*/,
+						   void* /*param_value*/,
+						   size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
+						   
+typedef CL_API_ENTRY cl_int
+     ( CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel /* in_kernel */,
+						      cl_device_id /*in_device*/,
+						      cl_kernel_sub_group_info /* param_name */,
+						      size_t /*input_value_size*/,
+						      const void * /*input_value*/,
+						      size_t /*param_value_size*/,
+						      void* /*param_value*/,
+						      size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
+#endif /* CL_VERSION_2_0 */
+
+#ifdef CL_VERSION_2_1
+/*********************************
+* cl_khr_priority_hints extension
+*********************************/
+#define cl_khr_priority_hints 1
+
+typedef cl_uint  cl_queue_priority_khr;
+
+/* cl_command_queue_properties */
+#define CL_QUEUE_PRIORITY_KHR 0x1096
+
+/* cl_queue_priority_khr */
+#define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0)
+#define CL_QUEUE_PRIORITY_MED_KHR (1<<1)
+#define CL_QUEUE_PRIORITY_LOW_KHR (1<<2)
+
+#endif /* CL_VERSION_2_1 */
+
+#ifdef CL_VERSION_2_1
+/*********************************
+* cl_khr_throttle_hints extension
+*********************************/
+#define cl_khr_throttle_hints 1
+
+typedef cl_uint  cl_queue_throttle_khr;
+
+/* cl_command_queue_properties */
+#define CL_QUEUE_THROTTLE_KHR 0x1097
+
+/* cl_queue_throttle_khr */
+#define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0)
+#define CL_QUEUE_THROTTLE_MED_KHR (1<<1)
+#define CL_QUEUE_THROTTLE_LOW_KHR (1<<2)
+
+#endif /* CL_VERSION_2_1 */
+
+#ifdef CL_VERSION_2_2
+/*********************************
+* cl_khr_subgroup_named_barrier
+*********************************/
+#define cl_khr_subgroup_named_barrier 1
+
+/* cl_device_info */
+#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR       0x2035
+
+#endif /* CL_VERSION_2_2 */
+
+/**********************************
+ * cl_arm_import_memory extension *
+ **********************************/
+
+#ifdef CL_VERSION_1_0
+
+typedef intptr_t cl_import_properties_arm;
+
+/* Default and valid property name for cl_arm_import_memory */
+#define CL_IMPORT_TYPE_ARM                        0x40B2
+
+/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */
+#define CL_IMPORT_TYPE_HOST_ARM                   0x40B3
+
+/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
+#define CL_IMPORT_TYPE_DMA_BUF_ARM                0x40B4
+
+/* Secure DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
+#define CL_IMPORT_TYPE_SECURE_ARM                 0x40B5
+
+/* This extension adds a new function that allows for direct memory import into
+ * OpenCL via the clImportMemoryARM function.
+ *
+ * Memory imported through this interface will be mapped into the device's page
+ * tables directly, providing zero copy access. It will never fall back to copy
+ * operations and aliased buffers.
+ *
+ * Types of memory supported for import are specified as additional extension
+ * strings.
+ *
+ * This extension produces cl_mem allocations which are compatible with all other
+ * users of cl_mem in the standard API.
+ *
+ * This extension maps pages with the same properties as the normal buffer creation
+ * function clCreateBuffer.
+ */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clImportMemoryARM( cl_context context,
+                   cl_mem_flags flags,
+                   const cl_import_properties_arm *properties,
+                   void *memory,
+                   size_t size,
+                   cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0;
+
+
+#endif /* CL_VERSION_1_0 */
+
+/******************************************
+ * cl_arm_shared_virtual_memory extension *
+ ******************************************/
+
+#ifdef CL_VERSION_1_2
+
+/* Used by clGetDeviceInfo */
+#define CL_DEVICE_SVM_CAPABILITIES_ARM                  0x40B6
+
+/* Used by clGetMemObjectInfo */
+#define CL_MEM_USES_SVM_POINTER_ARM                     0x40B7
+
+/* Used by clSetKernelExecInfoARM: */
+#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM                0x40B8
+#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM   0x40B9
+
+/* To be used by clGetEventInfo: */
+#define CL_COMMAND_SVM_FREE_ARM                         0x40BA
+#define CL_COMMAND_SVM_MEMCPY_ARM                       0x40BB
+#define CL_COMMAND_SVM_MEMFILL_ARM                      0x40BC
+#define CL_COMMAND_SVM_MAP_ARM                          0x40BD
+#define CL_COMMAND_SVM_UNMAP_ARM                        0x40BE
+
+/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. */
+#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM           (1 << 0)
+#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM             (1 << 1)
+#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM             (1 << 2)
+#define CL_DEVICE_SVM_ATOMICS_ARM                       (1 << 3)
+
+/* Flag values used by clSVMAllocARM: */
+#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM                (1 << 10)
+#define CL_MEM_SVM_ATOMICS_ARM                          (1 << 11)
+
+typedef cl_bitfield cl_svm_mem_flags_arm;
+typedef cl_uint     cl_kernel_exec_info_arm;
+typedef cl_bitfield cl_device_svm_capabilities_arm;
+
+extern CL_API_ENTRY void * CL_API_CALL
+clSVMAllocARM(cl_context       /* context */,
+              cl_svm_mem_flags_arm /* flags */,
+              size_t           /* size */,
+              cl_uint          /* alignment */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY void CL_API_CALL
+clSVMFreeARM(cl_context        /* context */,
+             void *            /* svm_pointer */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMFreeARM(cl_command_queue  /* command_queue */,
+                    cl_uint           /* num_svm_pointers */,
+                    void *[]          /* svm_pointers[] */,
+                    void (CL_CALLBACK * /*pfn_free_func*/)(cl_command_queue /* queue */,
+                                                           cl_uint          /* num_svm_pointers */,
+                                                           void *[]         /* svm_pointers[] */,
+                                                           void *           /* user_data */),
+                    void *            /* user_data */,
+                    cl_uint           /* num_events_in_wait_list */,
+                    const cl_event *  /* event_wait_list */,
+                    cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemcpyARM(cl_command_queue  /* command_queue */,
+                      cl_bool           /* blocking_copy */,
+                      void *            /* dst_ptr */,
+                      const void *      /* src_ptr */,
+                      size_t            /* size */,
+                      cl_uint           /* num_events_in_wait_list */,
+                      const cl_event *  /* event_wait_list */,
+                      cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemFillARM(cl_command_queue  /* command_queue */,
+                       void *            /* svm_ptr */,
+                       const void *      /* pattern */,
+                       size_t            /* pattern_size */,
+                       size_t            /* size */,
+                       cl_uint           /* num_events_in_wait_list */,
+                       const cl_event *  /* event_wait_list */,
+                       cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMapARM(cl_command_queue  /* command_queue */,
+                   cl_bool           /* blocking_map */,
+                   cl_map_flags      /* flags */,
+                   void *            /* svm_ptr */,
+                   size_t            /* size */,
+                   cl_uint           /* num_events_in_wait_list */,
+                   const cl_event *  /* event_wait_list */,
+                   cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMUnmapARM(cl_command_queue  /* command_queue */,
+                     void *            /* svm_ptr */,
+                     cl_uint           /* num_events_in_wait_list */,
+                     const cl_event *  /* event_wait_list */,
+                     cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArgSVMPointerARM(cl_kernel    /* kernel */,
+                            cl_uint      /* arg_index */,
+                            const void * /* arg_value */) CL_EXT_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelExecInfoARM(cl_kernel            /* kernel */,
+                       cl_kernel_exec_info_arm  /* param_name */,
+                       size_t               /* param_value_size */,
+                       const void *         /* param_value */) CL_EXT_SUFFIX__VERSION_1_2;
+
+#endif /* CL_VERSION_1_2 */
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* __CL_EXT_H */
--- a/include/triton/external/CL/cl_ext_intel.h
+++ b/include/triton/external/CL/cl_ext_intel.h
@@ -0,0 +1,429 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2017 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+/*****************************************************************************\
+
+Copyright (c) 2013-2017 Intel Corporation All Rights Reserved.
+
+THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
+MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+File Name: cl_ext_intel.h
+
+Abstract:
+
+Notes:
+
+\*****************************************************************************/
+
+#ifndef __CL_EXT_INTEL_H
+#define __CL_EXT_INTEL_H
+
+#ifdef __APPLE__
+    #include <OpenCL/cl.h>
+    #include <OpenCL/cl_platform.h>
+#else
+    #include "cl.h"
+    #include "cl_platform.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/***************************************
+* cl_intel_thread_local_exec extension *
+****************************************/
+
+#define cl_intel_thread_local_exec 1
+
+#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL      (((cl_bitfield)1) << 31)
+
+/***********************************************
+* cl_intel_device_partition_by_names extension *
+************************************************/
+
+#define cl_intel_device_partition_by_names 1
+
+#define CL_DEVICE_PARTITION_BY_NAMES_INTEL          0x4052
+#define CL_PARTITION_BY_NAMES_LIST_END_INTEL        -1
+
+/************************************************
+* cl_intel_accelerator extension                *
+* cl_intel_motion_estimation extension          *
+* cl_intel_advanced_motion_estimation extension *
+*************************************************/
+
+#define cl_intel_accelerator 1
+#define cl_intel_motion_estimation 1
+#define cl_intel_advanced_motion_estimation 1
+
+typedef struct _cl_accelerator_intel* cl_accelerator_intel;
+typedef cl_uint cl_accelerator_type_intel;
+typedef cl_uint cl_accelerator_info_intel;
+
+typedef struct _cl_motion_estimation_desc_intel { /* Motion-estimation descriptor; presumably passed as `descriptor` to clCreateAcceleratorINTEL -- confirm. */
+    cl_uint mb_block_type;    /* presumably one of CL_ME_MB_TYPE_*_INTEL -- confirm */
+    cl_uint subpixel_mode;    /* presumably one of CL_ME_SUBPIXEL_MODE_*_INTEL -- confirm */
+    cl_uint sad_adjust_mode;  /* presumably one of CL_ME_SAD_ADJUST_MODE_*_INTEL -- confirm */
+    cl_uint search_path_type; /* presumably one of CL_ME_SEARCH_PATH_RADIUS_*_INTEL -- confirm */
+} cl_motion_estimation_desc_intel;
+
+/* error codes */
+#define CL_INVALID_ACCELERATOR_INTEL                              -1094
+#define CL_INVALID_ACCELERATOR_TYPE_INTEL                         -1095
+#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL                   -1096
+#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL                   -1097
+
+/* cl_accelerator_type_intel */
+#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL               0x0
+
+/* cl_accelerator_info_intel */
+#define CL_ACCELERATOR_DESCRIPTOR_INTEL                           0x4090
+#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL                      0x4091
+#define CL_ACCELERATOR_CONTEXT_INTEL                              0x4092
+#define CL_ACCELERATOR_TYPE_INTEL                                 0x4093
+
+/* cl_motion_estimation_desc_intel flags */
+#define CL_ME_MB_TYPE_16x16_INTEL                                 0x0
+#define CL_ME_MB_TYPE_8x8_INTEL                                   0x1
+#define CL_ME_MB_TYPE_4x4_INTEL                                   0x2
+
+#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL                         0x0
+#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL                            0x1
+#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL                            0x2
+
+#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL                          0x0
+#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL                          0x1
+
+#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL                        0x0
+#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL                        0x1
+#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL                      0x5
+
+#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL                         0x0
+#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL                  0x1
+#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL                    0x2
+#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL                           0x4
+
+#define CL_ME_FORWARD_INPUT_MODE_INTEL                            0x1
+#define CL_ME_BACKWARD_INPUT_MODE_INTEL                           0x2
+#define CL_ME_BIDIRECTION_INPUT_MODE_INTEL                        0x3
+
+#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL                          16
+#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL                            21
+#define CL_ME_BIDIR_WEIGHT_HALF_INTEL                             32
+#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL                        43
+#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL                    48
+
+#define CL_ME_COST_PENALTY_NONE_INTEL                             0x0
+#define CL_ME_COST_PENALTY_LOW_INTEL                              0x1
+#define CL_ME_COST_PENALTY_NORMAL_INTEL                           0x2
+#define CL_ME_COST_PENALTY_HIGH_INTEL                             0x3
+
+#define CL_ME_COST_PRECISION_QPEL_INTEL                           0x0
+#define CL_ME_COST_PRECISION_HPEL_INTEL                           0x1
+#define CL_ME_COST_PRECISION_PEL_INTEL                            0x2
+#define CL_ME_COST_PRECISION_DPEL_INTEL                           0x3
+
+#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL                  0x0
+#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL                0x1
+#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL                        0x2
+#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL        0x3
+
+#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL       0x4
+#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL                     0x4
+#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL            0x5
+#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL           0x6
+#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL             0x7
+#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL             0x8
+
+#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL                      0x0
+#define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL              0x1
+#define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL                0x2
+#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL                   0x3
+
+/* cl_device_info */
+#define CL_DEVICE_ME_VERSION_INTEL                                0x407E
+
+#define CL_ME_VERSION_LEGACY_INTEL                                0x0
+#define CL_ME_VERSION_ADVANCED_VER_1_INTEL                        0x1
+#define CL_ME_VERSION_ADVANCED_VER_2_INTEL                        0x2
+
+extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL
+clCreateAcceleratorINTEL(
+    cl_context                  /* context */,
+    cl_accelerator_type_intel   /* accelerator_type */,
+    size_t                      /* descriptor_size */,
+    const void*                 /* descriptor */,
+    cl_int*                     /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)(
+    cl_context                  /* context */,
+    cl_accelerator_type_intel   /* accelerator_type */,
+    size_t                      /* descriptor_size */,
+    const void*                 /* descriptor */,
+    cl_int*                     /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetAcceleratorInfoINTEL(
+    cl_accelerator_intel        /* accelerator */,
+    cl_accelerator_info_intel   /* param_name */,
+    size_t                      /* param_value_size */,
+    void*                       /* param_value */,
+    size_t*                     /* param_value_size_ret */) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)(
+    cl_accelerator_intel        /* accelerator */,
+    cl_accelerator_info_intel   /* param_name */,
+    size_t                      /* param_value_size */,
+    void*                       /* param_value */,
+    size_t*                     /* param_value_size_ret */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainAcceleratorINTEL(
+    cl_accelerator_intel        /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)(
+    cl_accelerator_intel        /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseAcceleratorINTEL(
+    cl_accelerator_intel        /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clReleaseAcceleratorINTEL_fn)(
+    cl_accelerator_intel        /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
+
+/******************************************
+* cl_intel_simultaneous_sharing extension *
+*******************************************/
+
+#define cl_intel_simultaneous_sharing 1
+
+#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL            0x4104
+#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL        0x4105
+
+/***********************************
+* cl_intel_egl_image_yuv extension *
+************************************/
+
+#define cl_intel_egl_image_yuv 1
+
+#define CL_EGL_YUV_PLANE_INTEL                           0x4107
+
+/********************************
+* cl_intel_packed_yuv extension *
+*********************************/
+
+#define cl_intel_packed_yuv 1
+
+#define CL_YUYV_INTEL                                    0x4076
+#define CL_UYVY_INTEL                                    0x4077
+#define CL_YVYU_INTEL                                    0x4078
+#define CL_VYUY_INTEL                                    0x4079
+
+/********************************************
+* cl_intel_required_subgroup_size extension *
+*********************************************/
+
+#define cl_intel_required_subgroup_size 1
+
+#define CL_DEVICE_SUB_GROUP_SIZES_INTEL                  0x4108
+#define CL_KERNEL_SPILL_MEM_SIZE_INTEL                   0x4109
+#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL           0x410A
+
+/****************************************
+* cl_intel_driver_diagnostics extension *
+*****************************************/
+
+#define cl_intel_driver_diagnostics 1
+
+typedef cl_uint cl_diagnostics_verbose_level;
+
+#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL                0x4106
+
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL           ( 0xff )
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL          ( 1 )
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL           ( 1 << 1 )
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL       ( 1 << 2 )
+
+/********************************
+* cl_intel_planar_yuv extension *
+*********************************/
+
+#define CL_NV12_INTEL                                       0x410E
+
+#define CL_MEM_NO_ACCESS_INTEL                              ( 1 << 24 )
+#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL              ( 1 << 25 )
+
+#define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL                0x417E
+#define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL               0x417F
+
+/*******************************************************
+* cl_intel_device_side_avc_motion_estimation extension *
+********************************************************/
+
+#define CL_DEVICE_AVC_ME_VERSION_INTEL                      0x410B
+#define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C
+#define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL          0x410D
+
+#define CL_AVC_ME_VERSION_0_INTEL                           0x0   /* No support. (No trailing ';' so the macro is usable inside expressions.) */
+#define CL_AVC_ME_VERSION_1_INTEL                           0x1   /* First supported version. */
+
+#define CL_AVC_ME_MAJOR_16x16_INTEL                         0x0
+#define CL_AVC_ME_MAJOR_16x8_INTEL                          0x1
+#define CL_AVC_ME_MAJOR_8x16_INTEL                          0x2
+#define CL_AVC_ME_MAJOR_8x8_INTEL                           0x3
+
+#define CL_AVC_ME_MINOR_8x8_INTEL                           0x0
+#define CL_AVC_ME_MINOR_8x4_INTEL                           0x1
+#define CL_AVC_ME_MINOR_4x8_INTEL                           0x2
+#define CL_AVC_ME_MINOR_4x4_INTEL                           0x3
+
+#define CL_AVC_ME_MAJOR_FORWARD_INTEL                       0x0
+#define CL_AVC_ME_MAJOR_BACKWARD_INTEL                      0x1
+#define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL                 0x2
+
+#define CL_AVC_ME_PARTITION_MASK_ALL_INTEL                  0x0
+#define CL_AVC_ME_PARTITION_MASK_16x16_INTEL                0x7E
+#define CL_AVC_ME_PARTITION_MASK_16x8_INTEL                 0x7D
+#define CL_AVC_ME_PARTITION_MASK_8x16_INTEL                 0x7B
+#define CL_AVC_ME_PARTITION_MASK_8x8_INTEL                  0x77
+#define CL_AVC_ME_PARTITION_MASK_8x4_INTEL                  0x6F
+#define CL_AVC_ME_PARTITION_MASK_4x8_INTEL                  0x5F
+#define CL_AVC_ME_PARTITION_MASK_4x4_INTEL                  0x3F
+
+#define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL            0x0
+#define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL                 0x1
+#define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL                  0x2
+#define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL            0x3
+#define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL               0x4
+#define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL         0x5
+#define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL             0x6
+#define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL             0x7
+#define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL                0x8
+#define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL          0x9
+#define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL            0x2  /* NOTE(review): same value as SEARCH_WINDOW_TINY and out of the 0x9, 0xa sequence - confirm intended */
+#define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL            0xa
+
+#define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL                0x0
+#define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL                0x2
+
+#define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL               0x0
+#define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL                  0x1
+#define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL                  0x3
+
+#define CL_AVC_ME_COST_PRECISION_QPEL_INTEL                 0x0
+#define CL_AVC_ME_COST_PRECISION_HPEL_INTEL                 0x1
+#define CL_AVC_ME_COST_PRECISION_PEL_INTEL                  0x2
+#define CL_AVC_ME_COST_PRECISION_DPEL_INTEL                 0x3
+
+#define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL                0x10
+#define CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL                  0x15
+#define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL                   0x20
+#define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL              0x2B
+#define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL          0x30
+
+#define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL                 0x0  /* NOTE(review): 0x0 is not a testable bit, unlike 0x2/0x4/0x8 below - confirm intended */
+#define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL                0x2
+#define CL_AVC_ME_BORDER_REACHED_TOP_INTEL                  0x4
+#define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL               0x8
+
+#define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL          0x0
+#define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL            0x4000
+
+#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL     ( 0x1 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL    ( 0x2 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL        ( 0x3 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL       ( 0x55 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL      ( 0xAA << 24 )  /* NOTE(review): result needs bit 31; the literal is a signed int, so this overflows where int is 32 bits */
+#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL          ( 0xFF << 24 )  /* NOTE(review): result needs bit 31; same signed-shift concern as above */
+#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL     ( 0x1 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL    ( 0x2 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL     ( 0x1 << 26 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL    ( 0x2 << 26 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL     ( 0x1 << 28 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL    ( 0x2 << 28 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL     ( 0x1 << 30 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL    ( 0x2 << 30 )  /* NOTE(review): 0x2 << 30 is bit 31; same signed-shift concern as above */
+
+#define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL                0x00
+#define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL                0x80
+
+#define CL_AVC_ME_INTRA_16x16_INTEL                         0x0
+#define CL_AVC_ME_INTRA_8x8_INTEL                           0x1
+#define CL_AVC_ME_INTRA_4x4_INTEL                           0x2
+
+#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL     0x6
+#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL       0x5
+#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL       0x3 
+
+#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL         0x60
+#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL        0x10
+#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL  0x8
+#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL   0x4
+
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL            0x0
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL          0x1
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL                  0x2
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL  0x3
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL               0x4  /* NOTE(review): shares 0x4 with DIAGONAL_DOWN_RIGHT above - confirm intended */
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL      0x5
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL     0x6
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL       0x7
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL       0x8
+#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL                0x0
+#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL        0x1
+#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL          0x2
+#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL             0x3
+
+#define CL_AVC_ME_FRAME_FORWARD_INTEL                       0x1
+#define CL_AVC_ME_FRAME_BACKWARD_INTEL                      0x2
+#define CL_AVC_ME_FRAME_DUAL_INTEL                          0x3
+
+#define CL_AVC_ME_SLICE_TYPE_PRED_INTEL                     0x0
+#define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL                    0x1
+#define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL                    0x2
+
+#define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL           0x0
+#define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL        0x1  
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CL_EXT_INTEL_H */
+
--- a/include/triton/external/CL/cl_gl.h
+++ b/include/triton/external/CL/cl_gl.h
@@ -0,0 +1,167 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+#ifndef __OPENCL_CL_GL_H
+#define __OPENCL_CL_GL_H
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include "cl.h"
+#endif	
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef cl_uint     cl_gl_object_type;
+typedef cl_uint     cl_gl_texture_info;
+typedef cl_uint     cl_gl_platform_info;
+typedef struct __GLsync *cl_GLsync;
+
+/* cl_gl_object_type: the values defined below lie in 0x2000 - 0x2011 (not contiguous) */
+#define CL_GL_OBJECT_BUFFER                     0x2000
+#define CL_GL_OBJECT_TEXTURE2D                  0x2001
+#define CL_GL_OBJECT_TEXTURE3D                  0x2002
+#define CL_GL_OBJECT_RENDERBUFFER               0x2003
+#define CL_GL_OBJECT_TEXTURE2D_ARRAY            0x200E
+#define CL_GL_OBJECT_TEXTURE1D                  0x200F
+#define CL_GL_OBJECT_TEXTURE1D_ARRAY            0x2010
+#define CL_GL_OBJECT_TEXTURE_BUFFER             0x2011
+
+/* cl_gl_texture_info           */
+#define CL_GL_TEXTURE_TARGET                    0x2004
+#define CL_GL_MIPMAP_LEVEL                      0x2005
+#define CL_GL_NUM_SAMPLES                       0x2012
+
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLBuffer(cl_context     /* context */,
+                     cl_mem_flags   /* flags */,
+                     cl_GLuint      /* bufobj */,
+                     cl_int *       /* errcode_ret; cl_int like the other clCreateFromGL* entry points */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLTexture(cl_context   context,
+                      cl_mem_flags flags,
+                      cl_GLenum    target,
+                      cl_GLint     miplevel,
+                      cl_GLuint    texture,
+                      cl_int *     errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+    
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLRenderbuffer(cl_context   context,
+                           cl_mem_flags flags,
+                           cl_GLuint    renderbuffer,
+                           cl_int *     errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLObjectInfo(cl_mem              memobj,
+                  cl_gl_object_type * gl_object_type,
+                  cl_GLuint *         gl_object_name) CL_API_SUFFIX__VERSION_1_0;
+                  
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLTextureInfo(cl_mem             memobj,
+                   cl_gl_texture_info param_name,
+                   size_t             param_value_size,
+                   void *             param_value,
+                   size_t *           param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireGLObjects(cl_command_queue command_queue,
+                          cl_uint          num_objects,
+                          const cl_mem *   mem_objects,
+                          cl_uint          num_events_in_wait_list,
+                          const cl_event * event_wait_list,
+                          cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseGLObjects(cl_command_queue command_queue,
+                          cl_uint          num_objects,
+                          const cl_mem *   mem_objects,
+                          cl_uint          num_events_in_wait_list,
+                          const cl_event * event_wait_list,
+                          cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+
+
+/* Deprecated OpenCL 1.1 APIs */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture2D(cl_context   context,
+                        cl_mem_flags flags,
+                        cl_GLenum    target,
+                        cl_GLint     miplevel,
+                        cl_GLuint    texture,
+                        cl_int *     errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+    
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture3D(cl_context   context,
+                        cl_mem_flags flags,
+                        cl_GLenum    target,
+                        cl_GLint     miplevel,
+                        cl_GLuint    texture,
+                        cl_int *     errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+    
+/* cl_khr_gl_sharing extension  */
+    
+#define cl_khr_gl_sharing 1
+    
+typedef cl_uint     cl_gl_context_info;
+    
+/* Additional Error Codes  */
+#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR  -1000
+    
+/* cl_gl_context_info  */
+#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR    0x2006
+#define CL_DEVICES_FOR_GL_CONTEXT_KHR           0x2007
+    
+/* Additional cl_context_properties  */
+#define CL_GL_CONTEXT_KHR                       0x2008
+#define CL_EGL_DISPLAY_KHR                      0x2009
+#define CL_GLX_DISPLAY_KHR                      0x200A
+#define CL_WGL_HDC_KHR                          0x200B
+#define CL_CGL_SHAREGROUP_KHR                   0x200C
+    
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLContextInfoKHR(const cl_context_properties * properties,
+                      cl_gl_context_info            param_name,
+                      size_t                        param_value_size,
+                      void *                        param_value,
+                      size_t *                      param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
+    
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
+    const cl_context_properties * properties,
+    cl_gl_context_info            param_name,
+    size_t                        param_value_size,
+    void *                        param_value,
+    size_t *                      param_value_size_ret);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_CL_GL_H */
--- a/include/triton/external/CL/cl_gl_ext.h
+++ b/include/triton/external/CL/cl_gl_ext.h
@@ -0,0 +1,74 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have           */
+/* OpenGL dependencies.                                                         */
+
+#ifndef __OPENCL_CL_GL_EXT_H
+#define __OPENCL_CL_GL_EXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+    #include <OpenCL/cl_gl.h>
+#else
+    #include "cl_gl.h"
+#endif
+
+/*
+ * For each extension, follow this template
+ *  cl_VEN_extname extension  */
+/* #define cl_VEN_extname 1
+ * ... define new types, if any
+ * ... define new tokens, if any
+ * ... define new APIs, if any
+ *
+ *  If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header
+ *  This allows us to avoid having to decide whether to include GL headers or GLES here.
+ */
+
+/* 
+ *  cl_khr_gl_event  extension
+ *  See section 9.9 in the OpenCL 1.1 spec for more information
+ */
+#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR     0x200D
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromGLsyncKHR(cl_context context,
+                           cl_GLsync  sync,
+                           cl_int *   errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* __OPENCL_CL_GL_EXT_H  */
--- a/include/triton/external/CL/cl_platform.h
+++ b/include/triton/external/CL/cl_platform.h
--- a/include/triton/external/CL/cl_va_api_media_sharing_intel.h
+++ b/include/triton/external/CL/cl_va_api_media_sharing_intel.h
@@ -0,0 +1,172 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2016 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+/*****************************************************************************\
+
+Copyright (c) 2013-2016 Intel Corporation All Rights Reserved.
+
+THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
+MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+File Name: cl_va_api_media_sharing_intel.h
+
+Abstract:
+
+Notes:
+
+\*****************************************************************************/
+
+
+#ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
+#define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
+
+#include "cl.h"
+#include "cl_platform.h"
+#include <va/va.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************
+* cl_intel_va_api_media_sharing extension *
+*******************************************/
+
+#define cl_intel_va_api_media_sharing 1
+
+/* error codes */
+#define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL               -1098
+#define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL               -1099
+#define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL      -1100
+#define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL          -1101
+
+/* cl_va_api_device_source_intel */
+#define CL_VA_API_DISPLAY_INTEL                             0x4094
+
+/* cl_va_api_device_set_intel */
+#define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL               0x4095
+#define CL_ALL_DEVICES_FOR_VA_API_INTEL                     0x4096
+
+/* cl_context_info */
+#define CL_CONTEXT_VA_API_DISPLAY_INTEL                     0x4097
+
+/* cl_mem_info */
+#define CL_MEM_VA_API_MEDIA_SURFACE_INTEL                   0x4098
+
+/* cl_image_info */
+#define CL_IMAGE_VA_API_PLANE_INTEL                         0x4099
+
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL      0x409A
+#define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL      0x409B
+
+typedef cl_uint cl_va_api_device_source_intel;
+typedef cl_uint cl_va_api_device_set_intel;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceIDsFromVA_APIMediaAdapterINTEL(
+    cl_platform_id                platform,
+    cl_va_api_device_source_intel media_adapter_type,
+    void*                         media_adapter,
+    cl_va_api_device_set_intel    media_adapter_set,
+    cl_uint                       num_entries,
+    cl_device_id*                 devices,
+    cl_uint*                      num_devices) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)(
+    cl_platform_id                platform,
+    cl_va_api_device_source_intel media_adapter_type,
+    void*                         media_adapter,
+    cl_va_api_device_set_intel    media_adapter_set,
+    cl_uint                       num_entries,
+    cl_device_id*                 devices,
+    cl_uint*                      num_devices) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromVA_APIMediaSurfaceINTEL(
+    cl_context                    context,
+    cl_mem_flags                  flags,
+    VASurfaceID*                  surface,
+    cl_uint                       plane,
+    cl_int*                       errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)(
+    cl_context                    context,
+    cl_mem_flags                  flags,
+    VASurfaceID*                  surface,
+    cl_uint                       plane,
+    cl_int*                       errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireVA_APIMediaSurfacesINTEL(
+    cl_command_queue              command_queue,
+    cl_uint                       num_objects,
+    const cl_mem*                 mem_objects,
+    cl_uint                       num_events_in_wait_list,
+    const cl_event*               event_wait_list,
+    cl_event*                     event) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)(
+    cl_command_queue              command_queue,
+    cl_uint                       num_objects,
+    const cl_mem*                 mem_objects,
+    cl_uint                       num_events_in_wait_list,
+    const cl_event*               event_wait_list,
+    cl_event*                     event) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseVA_APIMediaSurfacesINTEL(
+    cl_command_queue              command_queue,
+    cl_uint                       num_objects,
+    const cl_mem*                 mem_objects,
+    cl_uint                       num_events_in_wait_list,
+    const cl_event*               event_wait_list,
+    cl_event*                     event) CL_EXT_SUFFIX__VERSION_1_2;
+	
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)(
+    cl_command_queue              command_queue,
+    cl_uint                       num_objects,
+    const cl_mem*                 mem_objects,
+    cl_uint                       num_events_in_wait_list,
+    const cl_event*               event_wait_list,
+    cl_event*                     event) CL_EXT_SUFFIX__VERSION_1_2;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H */
+
--- a/include/triton/external/CL/opencl.h
+++ b/include/triton/external/CL/opencl.h
@@ -0,0 +1,59 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_H
+#define __OPENCL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+
+#include <OpenCL/cl.h>
+#include <OpenCL/cl_gl.h>
+#include <OpenCL/cl_gl_ext.h>
+#include <OpenCL/cl_ext.h>
+
+#else
+
+#include "cl.h"
+#include "cl_gl.h"
+#include "cl_gl_ext.h"
+#include "cl_ext.h"
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_H   */
+
--- a/include/triton/external/CUDA/cuda.h
+++ b/include/triton/external/CUDA/cuda.h
--- a/include/triton/external/CUDA/nvml.h
+++ b/include/triton/external/CUDA/nvml.h
--- a/include/triton/external/half.hpp
+++ b/include/triton/external/half.hpp
--- a/include/triton/ir/basic_block.h
+++ b/include/triton/ir/basic_block.h
@@ -0,0 +1,85 @@
+#pragma once
+
+#ifndef _TRITON_IR_BASIC_BLOCK_H_
+#define _TRITON_IR_BASIC_BLOCK_H_
+
+#include <string>
+#include <list>
+#include "value.h"
+#include "visitor.h"
+
+namespace triton{
+namespace ir{
+
+class context;
+class function;
+class instruction;
+
+/* Basic Block: a value holding an ordered std::list of instruction* plus predecessor/successor vectors */
+class basic_block: public value{
+public:
+  // instruction iterator types (all derived from the underlying std::list<instruction*>)
+  typedef std::list<instruction*>                inst_list_t;
+  typedef inst_list_t::iterator                  iterator;
+  typedef inst_list_t::const_iterator            const_iterator;
+  typedef inst_list_t::reverse_iterator          reverse_iterator;
+  typedef inst_list_t::const_reverse_iterator    const_reverse_iterator;
+
+private:
+  // constructor is private; the only public construction path declared here is the static create() below
+  basic_block(context &ctx, const std::string &name, function *parent);
+
+public:
+  // accessors (declared non-const, so they cannot be called through a const basic_block)
+  function* get_parent() { return parent_; }
+  context& get_context() { return ctx_; }
+
+  // get iterator to first instruction that is not a phi
+  iterator get_first_non_phi();
+
+  // get instruction list (mutable reference: callers can insert into / erase from it directly)
+  inst_list_t           &get_inst_list()       { return inst_list_; }
+  void  erase(instruction *i)                  {  inst_list_.remove(i); } // drops every list entry equal to i; does not delete i
+
+  // instruction iterator functions (thin forwards to inst_list_)
+  inline iterator                begin()       { return inst_list_.begin(); }
+  inline const_iterator          begin() const { return inst_list_.begin(); }
+  inline iterator                end  ()       { return inst_list_.end();   }
+  inline const_iterator          end  () const { return inst_list_.end();   }
+
+  inline reverse_iterator        rbegin()       { return inst_list_.rbegin(); }
+  inline const_reverse_iterator  rbegin() const { return inst_list_.rbegin(); }
+  inline reverse_iterator        rend  ()       { return inst_list_.rend();   }
+  inline const_reverse_iterator  rend  () const { return inst_list_.rend();   }
+
+  inline size_t                   size() const { return inst_list_.size();  }
+  inline bool                    empty() const { return inst_list_.empty(); }
+  inline const instruction      &front() const { return *inst_list_.front(); } // front()/back() dereference the stored pointer: list must be non-empty
+  inline       instruction      &front()       { return *inst_list_.front(); }
+  inline const instruction       &back() const { return *inst_list_.back();  }
+  inline       instruction       &back()       { return *inst_list_.back();  }
+
+  // predecessors / successors (read-only views of preds_ and succs_)
+  const std::vector<basic_block*>& get_predecessors() const { return preds_; }
+  const std::vector<basic_block*>& get_successors() const { return succs_; }
+  void add_predecessor(basic_block* pred); // NOTE(review): no add_successor is declared; presumably this also updates succs_ somewhere - confirm in the .cpp
+
+  // factory functions
+  static basic_block* create(context &ctx, const std::string &name, function *parent);
+
+  // visitor: hands this block to v->visit_basic_block
+  void accept(visitor *v) { v->visit_basic_block(this); }
+
+private:
+  context &ctx_;
+  std::string name_;
+  function *parent_;
+  std::vector<basic_block*> preds_; // NOTE(review): <vector> is not included directly by this header; presumably provided via value.h or visitor.h - confirm
+  std::vector<basic_block*> succs_;
+  inst_list_t inst_list_;
+};
+
+}
+}
+
+#endif
--- a/include/triton/ir/builder.h
+++ b/include/triton/ir/builder.h
@@ -0,0 +1,164 @@
+#pragma once
+
+#ifndef _TRITON_IR_BUILDER_H_
+#define _TRITON_IR_BUILDER_H_
+
+#include <vector>
+#include <string>
+#include "instructions.h"
+#include "basic_block.h"
+#include "type.h"
+
+namespace triton{
+namespace ir{
+
+class basic_block;
+class value;
+class type;
+class constant_int;
+class instruction;
+class context;
+class phi_node;
+
+/* Builder: declares create_* helpers and inserts instructions at a movable insertion point */
+class builder{
+  typedef basic_block::iterator iterator;
+
+public:
+  // Constructor
+  builder(context &ctx);
+  // Insertion-point setters and getters
+  void set_insert_point(iterator instr);
+  void set_insert_point(instruction* i);
+  void set_insert_point_after(instruction* i);
+  void set_insert_point(basic_block* block);
+  basic_block* get_insert_block() { return block_; }
+  iterator get_insert_point() { return insert_point_;}
+  // Constants
+  value *get_int1(bool val);
+  value *get_int32(int32_t val);
+  value *get_int64(int64_t val);
+  // Types
+  type *get_void_ty();
+  type *get_int1_ty();
+  type *get_int8_ty();
+  type *get_int16_ty();
+  type *get_int32_ty();
+  type *get_int64_ty();
+  type *get_half_ty();
+  type *get_float_ty();
+  type *get_double_ty();
+  // Insert: adds inst to block_'s instruction list at insert_point_, then sets its parent and name
+  template<typename InstTy>
+  InstTy* insert(InstTy *inst, const std::string &name = ""){
+    assert(block_);  // an insertion block must have been set before inserting
+    block_->get_inst_list().insert(insert_point_, inst);
+    inst->set_parent(block_);
+    inst->set_name(name);
+//    for(ir::value* op: inst->ops())
+//      op->add_use(inst);
+    return inst;  // same pointer that was passed in, with its static type preserved
+  }
+  // terminator instructions
+  value* create_br(basic_block *dest);
+  value* create_cond_br(value *cond, basic_block* if_dest, basic_block* else_dest);
+  value* create_ret_void();
+  // Cast instructions
+  value *create_cast(cast_op_t op, value *v, type *dst_ty, const std::string &name = "");
+  value* create_ptr_to_int(value *src, type *dst_ty, const std::string &name = "");
+  value* create_si_to_fp(value *src, type *dst_ty, const std::string &name = "");
+  value* create_ui_to_fp(value *src, type *dst_ty, const std::string &name = "");
+  value* create_fp_to_si(value *src, type *dst_ty, const std::string &name = "");
+  value* create_fp_to_ui(value *src, type *dst_ty, const std::string &name = "");
+  value* create_fp_ext(value *src, type *dst_ty, const std::string &name = "");
+  value* create_fp_trunc(value *src, type *dst_ty, const std::string &name = "");
+  value* create_int_cast(value *src, type *dst_ty, bool is_signed, const std::string &name = "");
+  value *create_downcast(value *arg, const std::string &name = "");
+  // Phi instruction
+  phi_node* create_phi(type *ty, unsigned num_reserved, const std::string &name = "");
+  // Binary instructions (only mul/add/sub and the shifts expose the has_nuw/has_nsw flags)
+  value *create_insert_nuwnswb_binop(binary_op_t op, value *lhs, value *rhs, const std::string &name, bool has_nuw, bool has_nsw);
+  value *create_fmul(value *lhs, value *rhs, const std::string &name = "");
+  value *create_fdiv(value *lhs, value *rhs, const std::string &name = "");
+  value *create_frem(value *lhs, value *rhs, const std::string &name = "");
+  value *create_fadd(value *lhs, value *rhs, const std::string &name = "");
+  value *create_fsub(value *lhs, value *rhs, const std::string &name = "");
+  value *create_mul(value *lhs, value *rhs, const std::string &name = "", bool has_nuw = false, bool has_nsw = false);
+  value *create_sdiv(value *lhs, value *rhs, const std::string &name = "");
+  value *create_udiv(value *lhs, value *rhs, const std::string &name = "");
+  value *create_srem(value *lhs, value *rhs, const std::string &name = "");
+  value *create_urem(value *lhs, value *rhs, const std::string &name = "");
+  value *create_add(value *lhs, value *rhs, const std::string &name = "", bool has_nuw = false, bool has_nsw = false);
+  value *create_sub(value *lhs, value *rhs, const std::string &name = "", bool has_nuw = false, bool has_nsw = false);
+  value *create_shl(value *lhs, value *rhs, const std::string &name = "", bool has_nuw = false, bool has_nsw = false);
+  value *create_lshr(value *lhs, value *rhs, const std::string &name = "", bool has_nuw = false, bool has_nsw = false);
+  value *create_ashr(value *lhs, value *rhs, const std::string &name = "", bool has_nuw = false, bool has_nsw = false);
+  // GEP
+  value *create_gep(value *ptr, const std::vector<value*>& idx_list, const std::string &name = "");
+  // Comparison (int): one helper per ICMP_* predicate plus the generic create_icmp
+  value *create_icmp(cmp_pred_t pred, value *lhs, value *rhs, const std::string &name = "");
+  value *create_icmpSLE(value *lhs, value *rhs, const std::string &name = "");
+  value *create_icmpSLT(value *lhs, value *rhs, const std::string &name = "");
+  value *create_icmpSGE(value *lhs, value *rhs, const std::string &name = "");
+  value *create_icmpSGT(value *lhs, value *rhs, const std::string &name = "");
+  value *create_icmpULE(value *lhs, value *rhs, const std::string &name = "");
+  value *create_icmpULT(value *lhs, value *rhs, const std::string &name = "");
+  value *create_icmpUGE(value *lhs, value *rhs, const std::string &name = "");
+  value *create_icmpUGT(value *lhs, value *rhs, const std::string &name = "");
+  value *create_icmpEQ(value *lhs, value *rhs, const std::string &name = "");
+  value *create_icmpNE(value *lhs, value *rhs, const std::string &name = "");
+  // Comparison (float): only the ordered (O*) predicates have dedicated helpers
+  value *create_fcmp(cmp_pred_t pred, value *lhs, value *rhs, const std::string &name = "");
+  value *create_fcmpOLT(value *lhs, value *rhs, const std::string &name = "");
+  value *create_fcmpOGT(value *lhs, value *rhs, const std::string &name = "");
+  value *create_fcmpOLE(value *lhs, value *rhs, const std::string &name = "");
+  value *create_fcmpOGE(value *lhs, value *rhs, const std::string &name = "");
+  value *create_fcmpOEQ(value *lhs, value *rhs, const std::string &name = "");
+  value *create_fcmpONE(value *lhs, value *rhs, const std::string &name = "");
+  // Logical
+  value *create_and(value *lhs, value *rhs, const std::string &name = "");
+  value *create_xor(value *lhs, value *rhs, const std::string &name = "");
+  value *create_or(value *lhs, value *rhs, const std::string &name = "");
+  // Unary (declarations below are currently commented out)
+//  value *create_fneg(value *arg, const std::string &name = "");
+//  value *create_neg(value *arg, const std::string &name = "");
+//  value *create_not(value *arg, const std::string &name = "");
+  // Input/Output
+  value *create_load(value *arg, const std::string &name = "");
+  value *create_store(value *ptr, value *val, const std::string &name = "");
+  value *create_masked_load(value *arg, value *mask, value *false_value, const std::string &name = "");
+  value *create_masked_store(value *ptr, value *val, value *mask, const std::string &name = "");
+  // Tile instruction
+  value *create_splat(value *arg, const type::tile_shapes_t &shapes, const std::string &name = "");
+  value *create_reshape(value *arg, const type::tile_shapes_t &shapes, const std::string &name = "");
+  value *create_broadcast(value *arg, const type::tile_shapes_t &shapes, const std::string &name = "");
+  // Built-in instruction
+  value *create_get_program_id(unsigned axis, const std::string &name = "");
+  value *create_get_num_program(unsigned axis, const std::string &name = "");
+  value *create_atomic_cas(value *ptr, value *cmp, value *val, const std::string &name = "");
+  value *create_atomic_exch(value *ptr, value *val, const std::string &name = "");
+  value *create_atomic_add(value *ptr, value *val, value *msk, const std::string &name = "");
+  value *create_exp(value* arg, const std::string &name = "");
+  value *create_log(value* arg, const std::string &name = "");
+  value *create_dot(value *A, value *B, value *C, const std::string &name = "");
+  value *create_trans(value *A, const std::vector<int> &perm = {}, const std::string &name = "");
+  value *create_sqrt(value *A, const std::string &name = "");
+  value *create_reduce(value *A, reduce_inst::op_t op, unsigned axis, const std::string &name = "");
+  value *create_select(value *pred, value *if_value, value *else_value, const std::string &name = "");
+  // Intrinsics
+  value *create_copy_to_shared(value *arg, const std::string &name = "");
+  value *create_masked_load_async(value *arg, value *mask, value *false_value, const std::string &name = "");
+  value *create_copy_from_shared(value *arg, const std::string &name = "");
+  value *create_barrier(const std::string &name = "");
+  value *create_async_wait(int N);  // unlike the other create_* helpers, takes no name argument
+
+private:
+  context &ctx_;
+  basic_block *block_;     // block that insert() adds to; returned by get_insert_block()
+  iterator insert_point_;  // position in block_'s instruction list used by insert()
+};
+
+}
+}
+
+#endif
--- a/include/triton/ir/constant.h
+++ b/include/triton/ir/constant.h
@@ -0,0 +1,113 @@
+#pragma once
+
+#ifndef _TRITON_IR_CONSTANT_H_
+#define _TRITON_IR_CONSTANT_H_
+
+#include "enums.h"
+#include "value.h"
+#include <cassert>
+#include "visitor.h"
+
+namespace triton{
+namespace ir{
+
+class type;
+class context;
+
+/* Constant: abstract base of the constant classes below; each subclass supplies repr() */
+class constant: public user{
+protected:
+  using user::user;  // inherit user's constructors
+
+public:
+  static constant* get_all_ones_value(type *ty);
+  static constant* get_null_value(type *ty);
+  virtual std::string repr() const = 0;  // textual form of the constant
+};
+
+/* Undef value: constant whose repr() is "undef"; obtained through get() since the constructor is private */
+class undef_value: public constant{
+private:
+  undef_value(type *ty);
+
+public:
+  static undef_value* get(type* ty);
+  std::string repr() const { return "undef"; }
+  void accept(visitor* vst) { vst->visit_undef_value(this); }  // visitor double-dispatch
+};
+
+
+/* Constant int: integer constant stored as a uint64_t; obtained through get() */
+class constant_int: public constant{
+protected:
+  constant_int(type *ty, uint64_t value);
+
+public:
+  virtual uint64_t get_value() const { return value_; }  // virtual: subclasses may override
+  static constant_int *get(type *ty, uint64_t value);
+  std::string repr() const { return std::to_string(value_); }  // decimal text of value_
+  void accept(visitor* vst) { vst->visit_constant_int(this); }
+
+protected:
+  uint64_t value_;  // raw stored value, always held as unsigned 64-bit
+};
+
+/* Constant fp: floating-point constant stored as a double; obtained through the static get() helpers */
+class constant_fp: public constant{
+  constant_fp(type *ty, double value);
+
+public:
+  // Read-only accessor. Const-qualified (like constant_int::get_value) so that it
+  // can also be called through a const pointer or reference.
+  double get_value() const { return value_; }
+  static constant* get_negative_zero(type *ty);
+  static constant* get_zero_value_for_negation(type *ty);
+  static constant* get(context &ctx, double v);
+  static constant* get(type *ty, double v);
+  // textual form produced by std::to_string on the stored double
+  std::string repr() const { return std::to_string(value_); }
+  void accept(visitor* vst) { vst->visit_constant_fp(this); }
+
+private:
+  double value_;
+};
+
+
+/* Global Value: named constant carrying a linkage kind; repr() is its name */
+class global_value: public constant {
+public:
+  enum linkage_types_t {
+    external  // the only linkage kind declared here
+  };
+
+public:
+  global_value(type *ty, unsigned num_ops,
+               linkage_types_t linkage, const std::string &name,
+               unsigned addr_space);
+  std::string repr() const { return get_name(); }
+
+private:
+  linkage_types_t linkage_;  // no accessor for this member is declared in this class
+};
+
+/* global object: global_value whose constructor defaults addr_space to 0 */
+class global_object: public global_value {
+public:
+  global_object(type *ty, unsigned num_ops,
+               linkage_types_t linkage, const std::string &name,
+               unsigned addr_space = 0);
+  std::string repr() const { return get_name(); }  // printed by name
+};
+
+/* global variable: global_object constructed from a type and a constant_int size; visited via visit_alloc_const */
+class alloc_const: public global_object {
+public:
+  alloc_const(type *ty, constant_int *size,
+              const std::string &name = "");
+  std::string repr() const { return get_name(); }  // printed by name
+  void accept(visitor* vst) { vst->visit_alloc_const(this); }
+
+
+};
+
+}
+}
+
+#endif
--- a/include/triton/ir/context.h
+++ b/include/triton/ir/context.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#ifndef _TRITON_IR_CONTEXT_H_
+#define _TRITON_IR_CONTEXT_H_
+
+#include <memory>
+#include "triton/ir/type.h"
+
+namespace triton{
+namespace ir{
+
+class type;
+class context_impl;
+
+/* Context: thin handle that exposes a shared pointer to the context_impl state */
+class context {
+public:
+  context();
+
+public:
+  std::shared_ptr<context_impl> p_impl;  // public on purpose or not is unclear — NOTE(review): confirm
+};
+
+}
+}
+
+#endif
--- a/include/triton/ir/context_impl.h
+++ b/include/triton/ir/context_impl.h
@@ -0,0 +1,43 @@
+#pragma once
+
+#ifndef _TRITON_IR_CONTEXT_IMPL_H_
+#define _TRITON_IR_CONTEXT_IMPL_H_
+
+#include <map>
+#include "triton/ir/type.h"
+
+namespace triton{
+namespace ir{
+
+class context;
+class constant;
+class constant_int;
+class constant_fp;
+class undef_value;
+
+/* Context impl: holds the type objects and the lookup maps for derived types and constants */
+class context_impl {
+public:
+  // constructors
+  context_impl(context &ctx);
+
+public:
+  // primitive types
+  type void_ty, label_ty, half_ty, float_ty, double_ty;
+  // derived types
+  integer_type int1_ty, int8_ty, int16_ty, int32_ty, int64_ty, int128_ty;
+  // Pointer types keyed by (type, unsigned) — presumably (pointee, address space); tile types keyed by (type, shapes)
+  std::map<std::pair<type*, unsigned>, pointer_type*> ptr_tys;
+  std::map<std::pair<type*, type::tile_shapes_t>, tile_type*> tile_tys;
+  // Int constants, keyed by (type, value)
+  std::map<std::pair<type*, uint64_t>, constant_int*> int_constants_;
+  // Float constants, keyed by (type, value)
+  std::map<std::pair<type*, double>, constant_fp*> fp_constants_;
+  // undef values, one entry per type
+  std::map<type*, undef_value*> uv_constants_;
+};
+
+}
+}
+
+#endif
--- a/include/triton/ir/enums.h
+++ b/include/triton/ir/enums.h
@@ -0,0 +1,154 @@
+#pragma once
+
+#ifndef _TRITON_IR_ENUMS_H_
+#define _TRITON_IR_ENUMS_H_
+
+namespace triton{
+namespace ir{
+
+
+enum binary_op_t: unsigned int{  // opcodes for binary_operator; F* = floating point, U*/S* = unsigned/signed
+  Add,
+  FAdd,
+  Sub,
+  FSub,
+  Mul,
+  FMul,
+  UDiv,
+  SDiv,
+  FDiv,
+  URem,
+  SRem,
+  FRem,
+  Shl,
+  LShr,
+  AShr,
+  And,
+  Or,
+  Xor
+};
+
+enum cast_op_t: unsigned int {  // cast kinds; each one has a matching INST_CAST_* entry in value_id_t
+  Trunc,
+  ZExt,
+  SExt,
+  FPTrunc,
+  FPExt,
+  UIToFP,
+  SIToFP,
+  FPToUI,
+  FPToSI,
+  PtrToInt,
+  IntToPtr,
+  BitCast,
+  AddrSpaceCast
+};
+
+enum cmp_pred_t: unsigned int {  // comparison predicates: float block first, then integer block
+  FIRST_FCMP_PREDICATE,  // sentinel: a distinct enumerator (value 0), not an alias of FCMP_FALSE
+  FCMP_FALSE,
+  FCMP_OEQ,
+  FCMP_OGT,
+  FCMP_OGE,
+  FCMP_OLT,
+  FCMP_OLE,
+  FCMP_ONE,
+  FCMP_ORD,
+  FCMP_UNO,
+  FCMP_UEQ,
+  FCMP_UGT,
+  FCMP_UGE,
+  FCMP_ULT,
+  FCMP_ULE,
+  FCMP_UNE,
+  FCMP_TRUE,
+  LAST_FCMP_PREDICATE,   // sentinel: distinct enumerator following FCMP_TRUE
+  FIRST_ICMP_PREDICATE,  // sentinel: distinct enumerator preceding ICMP_EQ
+  ICMP_EQ,
+  ICMP_NE,
+  ICMP_UGT,
+  ICMP_UGE,
+  ICMP_ULT,
+  ICMP_ULE,
+  ICMP_SGT,
+  ICMP_SGE,
+  ICMP_SLT,
+  ICMP_SLE,
+  LAST_ICMP_PREDICATE    // sentinel: distinct enumerator following ICMP_SLE
+};
+
+enum value_id_t: unsigned {  // identifies the concrete instruction kind; returned by instruction::get_id()
+  /* ------------ *
+    INSTRUCTIONS
+   * ------------ */
+  INST_BEGIN,
+  // phi
+  INST_PHI,
+  // arithmetic
+  INST_BINOP,
+  INST_GETELEMENTPTR,
+  INST_SELECT,
+  INST_SQRT,
+  // cmp
+  INST_ICMP,
+  INST_FCMP,
+  // cast (one id per cast_op_t value)
+  INST_CAST_TRUNC,
+  INST_CAST_ZEXT,
+  INST_CAST_SEXT,
+  INST_CAST_FP_TRUNC,
+  INST_CAST_FP_EXT,
+  INST_CAST_UI_TO_FP,
+  INST_CAST_SI_TO_FP,
+  INST_CAST_FP_TO_UI,
+  INST_CAST_FP_TO_SI,
+  INST_CAST_PTR_TO_INT,
+  INST_CAST_INT_TO_PTR,
+  INST_CAST_BIT_CAST,
+  INST_CAST_ADDR_SPACE_CAST,
+  // terminators
+  INST_RETURN,
+  INST_COND_BRANCH,
+  INST_UNCOND_BRANCH,
+  // io
+  INST_UNMASKED_LOAD,
+  INST_MASKED_LOAD,
+  INST_MASKED_LOAD_ASYNC,
+  INST_UNMASKED_STORE,
+  INST_MASKED_STORE,
+  // retile
+  INST_RESHAPE,
+  INST_SPLAT,
+  INST_BROADCAST,
+  INST_DOWNCAST,
+  // builtin
+  INST_GET_PROGRAM_ID,
+  INST_GET_NUM_PROGRAMS,
+  // atomics
+  INST_ATOMIC_CAS,
+  INST_ATOMIC_EXCH,
+  INST_ATOMIC_ADD,
+  // math
+  INST_EXP,
+  INST_LOG,
+  // array arithmetic
+  INST_TRANS,
+  INST_REDUCE,
+  INST_DOT,
+  // intrinsics
+  INST_COPY_TO_SHARED,
+  INST_COPY_FROM_SHARED,
+  INST_RECOALESCE,
+  INST_BARRIER,
+  INST_ASYNC_WAIT,
+  INST_MAKE_RANGE_DYN,
+  INST_MAKE_RANGE_STA,
+  INST_MAKE_RANGE
+};
+
+
+
+}
+}
+
+#endif
--- a/include/triton/ir/function.h
+++ b/include/triton/ir/function.h
@@ -0,0 +1,136 @@
+#pragma once
+
+#ifndef _TRITON_IR_FUNCTION_H_
+#define _TRITON_IR_FUNCTION_H_
+
+#include <string>
+#include <map>
+#include "value.h"
+#include "constant.h"
+
+namespace triton{
+namespace ir{
+
+class function;
+class function_type;
+class module;
+class basic_block;
+
+/* Argument: value that records its parent function and its position in the argument list */
+class argument: public value{
+  argument(type *ty, const std::string &name, function *parent, unsigned arg_no);
+
+public:
+  static argument* create(type *ty, const std::string &name,
+                          function *parent = nullptr, unsigned arg_no = 0);
+  function* get_parent() const;
+  unsigned get_arg_no() const;  // function::get_attributes() looks attributes up under get_arg_no() + 1
+
+  void accept(visitor *v);
+
+private:
+  function *parent_;
+  unsigned arg_no_;
+};
+
+/* Attribute kinds: the tag stored in attribute::kind_ */
+enum attribute_kind_t {
+  readonly,
+  writeonly,
+  noalias,
+  aligned,      // attribute::repr() prints this kind together with its value
+  multiple_of,  // the only kind for which attribute::is_llvm_attr() returns false
+  retune,
+  not_implemented  // has no case in attribute::repr(), which asserts on it
+};
+
+// An (attribute kind, unsigned value) pair; function stores these per argument id
+// in std::set, hence the ordering operator below.
+class attribute {
+public:
+  // value is only meaningful for kinds that carry one (printed for aligned / multiple_of)
+  attribute(attribute_kind_t kind, unsigned value = 0):
+    kind_(kind), value_(value){}
+
+  // lexicographic order on (kind, value)
+  bool operator<(const attribute& other) const {
+    return std::make_pair(kind_, value_) < std::make_pair(other.kind_, other.value_);
+  }
+
+  attribute_kind_t get_kind() const {
+    return kind_;
+  }
+
+  unsigned get_value() const {
+    return value_;
+  }
+
+  // true for every kind except multiple_of
+  bool is_llvm_attr() const {
+    return kind_ != multiple_of;
+  }
+
+  // Textual form of the attribute. Kinds without a case (e.g. not_implemented)
+  // trip the assert; with asserts disabled an empty string is returned.
+  std::string repr() const {
+    switch(kind_){
+      case readonly: return ".readonly";
+      case writeonly: return ".writeonly";
+      case noalias: return ".noalias";
+      case aligned: return ".aligned(" + std::to_string(value_) + ")";
+      // previously printed ".readonly", which hid both the kind and its value
+      case multiple_of: return ".multiple_of(" + std::to_string(value_) + ")";
+      // previously misspelled as ".retunr"
+      case retune: return ".retune";
+      default: break;
+    }
+    assert(false);
+    return "";
+  }
+
+private:
+  attribute_kind_t kind_;
+  unsigned value_;
+};
+
+/* Function: global object holding its arguments, basic blocks and per-argument attribute sets */
+class function: public global_object{
+  typedef std::vector<argument*> args_t;
+  typedef args_t::iterator       arg_iterator;
+  typedef args_t::const_iterator const_arg_iterator;
+
+  typedef std::vector<basic_block*> blocks_t;
+  typedef blocks_t::iterator        block_iterator;
+  typedef blocks_t::const_iterator  const_block_iterator;
+
+  // argument id -> set of attributes attached to it
+  typedef std::map<unsigned, std::set<attribute>> attr_map_t;
+
+private:
+  // private: instances are obtained through create()
+  function(function_type *ty, linkage_types_t linkage,
+           const std::string &name = "", module *parent = nullptr);
+
+public:
+  // accessors
+  const args_t &args() { return args_; }
+  function_type* get_fn_type() { return fn_ty_; }
+
+  // factory methods
+  static function *create(function_type *ty, linkage_types_t linkage,
+                          const std::string &name, module *mod);
+  // blocks
+  const blocks_t &blocks() { return blocks_; }
+  void insert_block(basic_block* block, basic_block *next = nullptr);
+
+  // attributes
+  void add_attr(unsigned arg_id, attribute attr) { attrs_[arg_id].insert(attr); }
+  const attr_map_t &attrs() { return attrs_; }
+  bool has_attr(unsigned arg_id) const { return attrs_.find(arg_id) != attrs_.end(); }
+  // Returns a copy of the attributes stored under id get_arg_no() + 1, or an empty
+  // set when there are none. The lookup uses find() rather than operator[]:
+  // operator[] would insert an empty entry as a side effect, after which
+  // has_attr() would report true for an argument that has no attributes.
+  std::set<attribute> get_attributes(argument* arg) {
+    attr_map_t::const_iterator it = attrs_.find(arg->get_arg_no() + 1);
+    if(it == attrs_.end())
+      return std::set<attribute>();
+    return it->second;
+  }
+
+  // visitor
+  void accept(visitor *v) { v->visit_function(this); }
+
+private:
+  module *parent_;
+  bool init_;
+  function_type *fn_ty_;
+  args_t args_;
+  blocks_t blocks_;
+  attr_map_t attrs_;
+};
+
+}
+}
+
+#endif
--- a/include/triton/ir/instructions.h
+++ b/include/triton/ir/instructions.h
@@ -0,0 +1,874 @@
+#pragma once
+
+#ifndef _TRITON_IR_INSTRUCTIONS_H_
+#define _TRITON_IR_INSTRUCTIONS_H_
+
+#include <vector>
+#include <map>
+#include "triton/ir/enums.h"
+#include "triton/ir/constant.h"
+#include "triton/ir/value.h"
+#include "triton/ir/type.h"
+#include "triton/ir/metadata.h"
+#include "triton/ir/visitor.h"
+
+#define _TRITON_DEFINE_CLONE(name) /* defines clone_impl() as a copy-construction of `name` */ \
+  ir::instruction* clone_impl() const { return new name(*this); }
+
+#define _TRITON_DEFINE_ACCEPT(name) /* defines accept() forwarding to visitor::visit_<name> */ \
+  void accept(visitor* v) { v->visit_ ## name (this); }
+
+namespace triton{
+namespace ir{
+
+class constant_int;
+class constant;
+class make_range;
+class basic_block;
+class context;
+class visitor;
+
+//===----------------------------------------------------------------------===//
+//                               instruction classes
+//===----------------------------------------------------------------------===//
+
+class result_reference;
+
+
+class instruction: public user{  // abstract base of the instruction classes declared below
+public:
+  virtual std::string repr_impl() const = 0;  // per-class textual name; repr() forwards to it
+
+private:
+  virtual ir::instruction* clone_impl() const = 0;  // usually provided via _TRITON_DEFINE_CLONE
+
+protected:
+  // constructors
+  instruction(type *ty, value_id_t ity, unsigned num_ops,
+              const std::string &name = "", instruction *next = nullptr);
+
+public:
+  // parent
+  void set_parent(basic_block *block)                         { parent_ = block; }
+  const basic_block *get_parent() const                       { return parent_;  }
+  basic_block *get_parent()                                   { return parent_;  }
+  void erase_from_parent();
+  // helpers
+  bool has_tile_result_or_op();
+  // repr
+  std::string repr() const                                    { return repr_impl(); }
+  // metadata: get_metadata uses std::map::operator[], so an unset kind is inserted with value 0
+  void set_metadata(ir::metadata::kind_t kind,
+                    unsigned value)                           { metadatas_[kind] = value;}
+  unsigned get_metadata(ir::metadata::kind_t kind)            { return metadatas_[kind];}
+  // cloning: copy through clone_impl(), then detach the copy (no parent block, no users)
+  ir::instruction* clone() {
+    ir::instruction* res = clone_impl();
+//    for(auto it = op_begin(); it != op_end(); it++)
+//      (*it)->add_use(res);
+    res->parent_ = nullptr;
+    res->users_.clear();
+    return res;
+  }
+  // instruction id
+  value_id_t get_id() const { return id_; }
+
+private:
+  basic_block *parent_;
+  std::map<ir::metadata::kind_t, unsigned> metadatas_;
+  value_id_t id_;
+};
+
+
+//===----------------------------------------------------------------------===//
+//                               phi_node classes
+//===----------------------------------------------------------------------===//
+
+class phi_node: public instruction {  // phi: incoming values are the operands, incoming blocks live in blocks_
+private:
+  phi_node(type *ty, unsigned num_reserved, const std::string &name, instruction *next);
+  std::string repr_impl() const { return "phi"; }
+
+public:
+  void set_incoming_value(unsigned i, value *v);
+  void set_incoming_block(unsigned i, basic_block *block);
+  value *get_value_for_block(basic_block *block);
+  value *get_incoming_value(unsigned i) { return get_operand(i); }
+  basic_block *get_incoming_block(unsigned i) { return blocks_[i]; }  // unchecked index into blocks_
+  unsigned get_num_incoming() { return get_num_operands(); }
+  void add_incoming(value *v, basic_block *block);
+
+  // Type: overwrites the type stored in ty_
+  void set_type(type *ty) { ty_ = ty; }
+
+  // Factory methods
+  static phi_node* create(type *ty, unsigned num_reserved, const std::string &name = "", instruction *next = nullptr);
+
+  _TRITON_DEFINE_CLONE(phi_node)
+  _TRITON_DEFINE_ACCEPT(phi_node)
+
+private:
+  unsigned num_reserved_;
+  std::vector<basic_block*> blocks_;  // indexed like the operands (see get_incoming_block)
+};
+
+//===----------------------------------------------------------------------===//
+//                               binary_operator classes
+//===----------------------------------------------------------------------===//
+class binary_operator: public instruction {  // two-operand instruction tagged with a binary_op_t opcode
+public:
+  typedef binary_op_t op_t;
+
+private:
+  std::string repr_impl() const;
+
+protected:
+  // Constructors
+  binary_operator(binary_op_t op, value *lhs, value *rhs, type *ty, const std::string &name, instruction *next);
+
+public:
+  // Get operand — NOTE: this returns the opcode (op_), not an operand
+  binary_op_t get_op() const { return op_; }
+
+  // Bool
+  bool is_terminator()  const;
+  bool is_binary_op()   const;
+  bool is_int_div_rem() const;
+  bool is_shift()       const;
+  bool is_cast()        const;
+  bool is_int_mult()    const;
+  bool is_int_add_sub() const;
+  bool is_int_div()     const;
+  bool is_int_rem()     const;
+  bool is_shl()         const;
+  bool is_shr()         const;
+
+  // Wraps: setters for the two no-wrap flags (no getters are declared; the flags are public members)
+  void set_has_no_unsigned_wrap(bool b = true) { has_no_unsigned_wrap_ = b; }
+  void set_has_no_signed_wrap(bool b = true)   { has_no_signed_wrap_ = b; }
+
+  // Factory methods
+  static binary_operator *create(binary_op_t op, value *lhs, value *rhs,
+                                 const std::string &name = "", instruction *next = nullptr);
+//  static binary_operator *create_fneg(value *arg, const std::string &name = "", instruction *next = nullptr);
+//  static binary_operator *create_neg(value *arg, const std::string &name = "", instruction *next = nullptr);
+//  static binary_operator *create_not(value *arg, const std::string &name = "", instruction *next = nullptr);
+
+  _TRITON_DEFINE_CLONE(binary_operator)
+  _TRITON_DEFINE_ACCEPT(binary_operator)
+
+public:  // NOTE(review): data members are public despite the trailing-underscore naming — confirm intent
+  binary_op_t op_;
+  bool has_no_unsigned_wrap_;
+  bool has_no_signed_wrap_;
+};
+
+
+//===----------------------------------------------------------------------===//
+//                               cmp_inst classes
+//===----------------------------------------------------------------------===//
+
+class cmp_inst: public instruction{  // common base of icmp_inst and fcmp_inst; stores the predicate
+public:
+  typedef cmp_pred_t pred_t;
+
+private:
+  std::string repr_impl() const;
+
+protected:
+  cmp_inst(type *ty, value_id_t id, cmp_pred_t pred,
+           value *lhs, value *rhs, const std::string &name, instruction *next);
+  static bool is_fp_predicate(cmp_pred_t pred);
+  static bool is_int_predicate(cmp_pred_t pred);
+  static type* make_cmp_result_type(type *ty);
+
+public:
+  cmp_pred_t get_pred() const { return pred_; }  // predicate given at construction
+
+private:
+  cmp_pred_t pred_;
+};
+
+class icmp_inst: public cmp_inst {  // integer comparison; constructor is private, use create()
+  icmp_inst(type *ty, cmp_pred_t pred,
+            value *lhs, value *rhs, const std::string &name, instruction *next);
+
+public:
+  static icmp_inst* create(cmp_pred_t pred, value *lhs, value *rhs,
+                    const std::string &name = "", instruction *next = nullptr);
+  _TRITON_DEFINE_CLONE(icmp_inst)
+  _TRITON_DEFINE_ACCEPT(icmp_inst)
+};
+
+class fcmp_inst: public cmp_inst {  // floating-point comparison; constructor is private, use create()
+  fcmp_inst(type *ty, cmp_pred_t pred,
+            value *lhs, value *rhs, const std::string &name, instruction *next);
+
+public:
+  static fcmp_inst* create(cmp_pred_t pred, value *lhs, value *rhs,
+                    const std::string &name = "", instruction *next = nullptr);
+  _TRITON_DEFINE_CLONE(fcmp_inst)
+  _TRITON_DEFINE_ACCEPT(fcmp_inst)
+};
+
+//===----------------------------------------------------------------------===//
+//                               unary_inst classes
+//===----------------------------------------------------------------------===//
+
+class unary_inst: public instruction {  // base for instructions built from a single value; only subclasses can construct it
+protected:
+  unary_inst(type *ty, value_id_t id, value *v, const std::string &name, instruction *next);
+};
+
+
+//===----------------------------------------------------------------------===//
+//                               cast_inst classes
+//===----------------------------------------------------------------------===//
+
+class cast_inst: public unary_inst{  // base of the per-opcode cast classes generated by TRITON_IR_DECLARE_CAST_INST_SIMPL
+private:
+  std::string repr_impl() const;
+
+protected:
+  cast_inst(type *ty, value_id_t id, value *v, const std::string &name, instruction *next, cast_op_t op)
+    : unary_inst(ty, id, v, name, next), op_(op) { }
+
+private:
+  static bool is_valid(cast_op_t op, value *arg, type *ty);
+
+public:
+  // accessors
+  cast_op_t get_op() const { return op_; }
+
+  // factory methods
+  static cast_inst *create(cast_op_t op, value *arg, type *ty,
+                           const std::string &name = "", instruction *next = nullptr);
+  static cast_inst *create_integer_cast(value *arg, type *ty, bool is_signed,
+                           const std::string &name = "", instruction *next = nullptr);
+
+  _TRITON_DEFINE_ACCEPT(cast_inst)  // no _TRITON_DEFINE_CLONE here: clone_impl comes from the subclasses
+
+private:
+  cast_op_t op_;
+};
+
+// Declares one concrete cast instruction class.
+//   cls      - name of the generated class
+//   value_id - value_id_t tag forwarded to cast_inst
+//   cast_op  - cast_op_t opcode forwarded to cast_inst
+// The generated class has a private constructor; cast_inst is a friend so that
+// its factory functions can instantiate it.
+#define TRITON_IR_DECLARE_CAST_INST_SIMPL(cls, value_id, cast_op) \
+class cls : public cast_inst { \
+  _TRITON_DEFINE_CLONE(cls) \
+  friend class cast_inst; \
+  cls(type *ty, value *v, const std::string &name, instruction *next) \
+    : cast_inst(ty, value_id, v, name, next, cast_op){ } \
+};
+
+TRITON_IR_DECLARE_CAST_INST_SIMPL(trunc_inst, INST_CAST_TRUNC, cast_op_t::Trunc)
+TRITON_IR_DECLARE_CAST_INST_SIMPL(z_ext_inst, INST_CAST_ZEXT, cast_op_t::ZExt)
+TRITON_IR_DECLARE_CAST_INST_SIMPL(s_ext_inst, INST_CAST_SEXT, cast_op_t::SExt)
+TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_trunc_inst, INST_CAST_FP_TRUNC, cast_op_t::FPTrunc)
+TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_ext_inst, INST_CAST_FP_EXT, cast_op_t::FPExt)
+TRITON_IR_DECLARE_CAST_INST_SIMPL(ui_to_fp_inst, INST_CAST_UI_TO_FP, cast_op_t::UIToFP)
+TRITON_IR_DECLARE_CAST_INST_SIMPL(si_to_fp_inst, INST_CAST_SI_TO_FP, cast_op_t::SIToFP)
+TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_to_ui_inst, INST_CAST_FP_TO_UI, cast_op_t::FPToUI)
+TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_to_si_inst, INST_CAST_FP_TO_SI, cast_op_t::FPToSI)
+TRITON_IR_DECLARE_CAST_INST_SIMPL(ptr_to_int_inst, INST_CAST_PTR_TO_INT, cast_op_t::PtrToInt)
+TRITON_IR_DECLARE_CAST_INST_SIMPL(int_to_ptr_inst, INST_CAST_INT_TO_PTR, cast_op_t::IntToPtr)
+TRITON_IR_DECLARE_CAST_INST_SIMPL(bit_cast_inst, INST_CAST_BIT_CAST, cast_op_t::BitCast)
+TRITON_IR_DECLARE_CAST_INST_SIMPL(addr_space_cast_inst, INST_CAST_ADDR_SPACE_CAST, cast_op_t::AddrSpaceCast)
+
+//===----------------------------------------------------------------------===//
+//                               terminator_inst classes
+//===----------------------------------------------------------------------===//
+
+class terminator_inst: public instruction{  // base of return_inst and branch_inst
+  using instruction::instruction;  // inherit instruction's constructors
+};
+
+// return instruction: optionally carries a return value as operand 0; has no successors
+class return_inst: public terminator_inst {
+private:
+  std::string repr_impl() const { return "ret"; }
+  return_inst(context &ctx, value *ret_val, instruction *next);
+
+public:
+  // accessors: operand 0 when there is at least one operand, otherwise nullptr
+  value *get_return_value()
+  { return get_num_operands() ? get_operand(0) : nullptr; }
+
+  unsigned get_num_successors() const { return 0; }
+
+  // factory methods
+  static return_inst* create(context &ctx, value *ret_val = nullptr, instruction *next = nullptr);
+
+  _TRITON_DEFINE_CLONE(return_inst)
+  _TRITON_DEFINE_ACCEPT(return_inst)
+};
+
+// base branch instruction: offers create() overloads for the unconditional and conditional forms
+class branch_inst: public terminator_inst{
+private:
+  std::string repr_impl() const { return "br"; }
+
+protected:
+  using terminator_inst::terminator_inst;
+
+public:
+  static branch_inst* create(basic_block *dest,
+                             instruction *next = nullptr);
+  static branch_inst* create(value *cond, basic_block *if_dest, basic_block *else_dest,
+                             instruction *next = nullptr);
+};
+
+// conditional branch: operand 0 = true destination, operand 1 = false destination, operand 2 = condition
+class cond_branch_inst: public branch_inst {
+private:
+  friend class branch_inst;  // lets branch_inst::create reach the private constructor
+  cond_branch_inst(basic_block *if_dst, basic_block *else_dst, value *cond, instruction *next);
+
+public:
+  basic_block *get_true_dest()  { return (basic_block*)get_operand(0); }
+  basic_block *get_false_dest() { return (basic_block*)get_operand(1); }
+  value *get_cond()             { return get_operand(2); }
+  _TRITON_DEFINE_CLONE(cond_branch_inst)
+  _TRITON_DEFINE_ACCEPT(cond_branch_inst)
+};
+
+// unconditional branch: operand 0 is the destination block
+class uncond_branch_inst: public branch_inst {
+private:
+  friend class branch_inst;  // lets branch_inst::create reach the private constructor
+  uncond_branch_inst(basic_block *dst, instruction *next);
+
+public:
+  basic_block *get_dest()  { return (basic_block*)get_operand(0); }
+  _TRITON_DEFINE_CLONE(uncond_branch_inst)
+  _TRITON_DEFINE_ACCEPT(uncond_branch_inst)
+};
+
+
+//===----------------------------------------------------------------------===//
+//                               getelementptr_inst classes
+//===----------------------------------------------------------------------===//
+
+class getelementptr_inst: public instruction {  // operand 0 is the pointer, the remaining operands are the indices
+private:
+  std::string repr_impl() const { return "getelementptr"; }
+  getelementptr_inst(type *pointee_ty, value *ptr, const std::vector<value*> &idx, const std::string &name, instruction *next);
+
+private:
+  static type *get_return_type(type *ty, value *ptr, const std::vector<value*> &idx);
+  static type *get_indexed_type_impl(type *ty, const std::vector<value *> &idx);
+  static type *get_indexed_type(type *ty, const std::vector<value*> &idx);
+
+public:
+  // accessors: the index range skips operand 0, which get_pointer_operand() returns
+  type *get_source_elt_ty() { return source_elt_ty; }
+  op_iterator idx_begin()       { return op_begin() + 1; }
+  op_iterator idx_end()         { return op_end(); }
+  value *get_pointer_operand()  { return *op_begin(); }
+
+  // factory methods
+  static getelementptr_inst* create(value *ptr, const std::vector<value*> &idx,
+                                    const std::string &name = "", instruction *next = nullptr);
+  _TRITON_DEFINE_CLONE(getelementptr_inst)
+  _TRITON_DEFINE_ACCEPT(getelementptr_inst)
+
+private:
+  type *source_elt_ty;
+  type *res_elt_ty;  // no accessor for this member is declared in this class
+};
+
+//===----------------------------------------------------------------------===//
+//                          load_inst/store_inst classes
+//===----------------------------------------------------------------------===//
+
+class io_inst: public instruction {  // common base of the load/store/atomic_add classes in this header
+protected:
+  io_inst(type *ty, value_id_t id, unsigned num_ops,
+          const std::string &name = "", instruction *next = nullptr);
+
+public:
+  // accessors
+  value *get_pointer_operand() { return get_operand(0); }  // pointer is always operand 0
+};
+
+// load: base class of the unmasked / masked / masked-async load instructions
+class load_inst: public io_inst {
+protected:
+  load_inst(value *ptr, value_id_t id, unsigned num_ops,
+          const std::string &name = "", instruction *next = nullptr);
+
+private:
+  static type *get_pointee_type(type *ty);  // private helper, defined out of line
+};
+
+// unmasked load: load taking only a pointer
+class unmasked_load_inst: public load_inst {
+private:
+  std::string repr_impl() const { return "unmasked_load"; }
+  unmasked_load_inst(value *ptr, const std::string &name, instruction *next);  // private: use create()
+
+public:
+  static unmasked_load_inst* create(value *ptr,
+                                    const std::string &name = "",
+                                    instruction *next = nullptr);
+  _TRITON_DEFINE_CLONE(unmasked_load_inst)
+  _TRITON_DEFINE_ACCEPT(unmasked_load_inst)
+};
+
+// masked load: load taking a pointer, a mask and a false_value
+class masked_load_inst: public load_inst {
+private:
+  std::string repr_impl() const { return "masked_load"; }
+  masked_load_inst(value *ptr, value *mask, value *false_value,
+                   const std::string &name, instruction *next);
+
+public:
+  // accessors
+  value *get_mask_operand() { return get_operand(1); }  // operand 1
+  value *get_false_value_operand() { return get_operand(2); }  // operand 2
+  // factory method
+  static masked_load_inst* create(value *ptr, value *mask, value *false_value,
+                                  const std::string &name = "",
+                                  instruction *next = nullptr);
+  _TRITON_DEFINE_CLONE(masked_load_inst)
+  _TRITON_DEFINE_ACCEPT(masked_load_inst)
+};
+
+// masked load async: same operand layout as masked_load_inst (ptr, mask, false_value)
+class masked_load_async_inst: public load_inst {
+private:
+  std::string repr_impl() const { return "masked_load_async"; }  // was "masked_load_async_async" (duplicated suffix)
+  masked_load_async_inst(value *ptr, value *mask, value *false_value,
+                   const std::string &name, instruction *next);
+
+public:
+  // accessors
+  value *get_mask_operand() { return get_operand(1); }  // operand 1
+  value *get_false_value_operand() { return get_operand(2); }  // operand 2
+  // factory method
+  static masked_load_async_inst* create(value *ptr, value *mask, value *false_value,
+                                  const std::string &name = "",
+                                  instruction *next = nullptr);
+  _TRITON_DEFINE_CLONE(masked_load_async_inst)
+  _TRITON_DEFINE_ACCEPT(masked_load_async_inst)
+};
+
+class atomic_add_inst: public io_inst {  // takes a pointer, a value and a mask
+private:
+  atomic_add_inst(value *ptr, value *val, value *msk, const std::string &name = "", instruction *next = nullptr);
+  std::string repr_impl() const { return "atomic_add"; }
+  _TRITON_DEFINE_CLONE(atomic_add_inst)  // NOTE: expanded in the private section here, unlike the load/store classes
+  _TRITON_DEFINE_ACCEPT(atomic_add_inst)
+
+public:
+  static instruction* create(value *ptr, value *val, value *msk, const std::string &name = "", instruction *next = nullptr);
+};
+
+
+// store: base class of the unmasked / masked store instructions
+class store_inst: public io_inst {
+protected:
+  store_inst(value *ptr, value_id_t id, unsigned num_ops,
+            const std::string &name = "", instruction *next = nullptr);
+
+public:
+  value *get_value_operand() { return get_operand(1); }  // stored value is operand 1 (pointer is operand 0)
+};
+
+// unmasked_store: store taking a pointer and a value
+class unmasked_store_inst: public store_inst{
+private:
+  std::string repr_impl() const { return "unmasked_store"; }
+  unmasked_store_inst(value *ptr, value *v, const std::string &name, instruction *next);  // private: use create()
+
+public:
+  // factory method
+  static unmasked_store_inst* create(value* ptr, value *v,
+                                    const std::string &name = "",
+                                    instruction *next = nullptr);
+  _TRITON_DEFINE_CLONE(unmasked_store_inst)
+  _TRITON_DEFINE_ACCEPT(unmasked_store_inst)
+};
+
+class masked_store_inst: public store_inst{  // store taking a pointer, a value and a mask
+private:
+  std::string repr_impl() const { return "masked_store"; }
+  masked_store_inst(value *ptr, value *v, value *mask,
+                    const std::string &name, instruction *next);
+
+public:
+  // accessors
+  value *get_mask_operand() { return get_operand(2); }  // mask is operand 2 here (operand 1 in the masked loads)
+  // factory method
+  static masked_store_inst* create(value *ptr, value *v, value *mask,
+                                   const std::string &name = "",
+                                   instruction *next = nullptr);
+  _TRITON_DEFINE_CLONE(masked_store_inst)
+  _TRITON_DEFINE_ACCEPT(masked_store_inst)
+};
+
+//===----------------------------------------------------------------------===//
+//                               retile_inst classes
+//===----------------------------------------------------------------------===//
+
+// retile: base class of reshape_inst, splat_inst and broadcast_inst
+
+class retile_inst: public unary_inst {
+protected:
+  retile_inst(value *arg, value_id_t id, const type::tile_shapes_t &shapes, const std::string &name, instruction *next);
+};
+
+// reshape: retile instruction printed as "reshape"
+
+class reshape_inst: public retile_inst {
+private:
+  using retile_inst::retile_inst;  // inherits the base constructors; instances come from create()
+  std::string repr_impl() const { return "reshape"; }
+
+public:
+  static instruction* create(value *arg, const type::tile_shapes_t &shape_suffix,
+                      const std::string &name = "", instruction *next = nullptr);
+  _TRITON_DEFINE_CLONE(reshape_inst)
+  _TRITON_DEFINE_ACCEPT(reshape_inst)
+};
+
+// splat: retile instruction printed as "splat"
+
+class splat_inst: public retile_inst {
+private:
+  using retile_inst::retile_inst;  // inherits the base constructors; instances come from create()
+  std::string repr_impl() const { return "splat"; }
+
+public:
+  static instruction* create(value *arg, const type::tile_shapes_t &shape_suffix,
+                      const std::string &name = "", instruction *next = nullptr);
+  _TRITON_DEFINE_CLONE(splat_inst)
+  _TRITON_DEFINE_ACCEPT(splat_inst)
+};
+
+// broadcast: retile instruction printed as "broadcast"
+
+class broadcast_inst: public retile_inst {
+private:
+  using retile_inst::retile_inst;  // inherits the base constructors; instances come from create()
+  std::string repr_impl() const { return "broadcast"; }
+
+public:
+  static instruction* create(value *arg, const type::tile_shapes_t &shape_suffix,
+                      const std::string &name = "", instruction *next = nullptr);
+  _TRITON_DEFINE_CLONE(broadcast_inst)
+  _TRITON_DEFINE_ACCEPT(broadcast_inst)
+};
+
+
+// downcast: derives from unary_inst directly, not from retile_inst
+
+class downcast_inst: public unary_inst {
+private:
+  using unary_inst::unary_inst;  // inherits the base constructors; instances come from create()
+  std::string repr_impl() const { return "downcast"; }
+
+public:
+  static instruction* create(value *arg, const std::string &name = "", instruction *next = nullptr);
+  _TRITON_DEFINE_CLONE(downcast_inst)
+  _TRITON_DEFINE_ACCEPT(downcast_inst)
+};
+
+//===----------------------------------------------------------------------===//
+//                               builtin_inst classes
+//===----------------------------------------------------------------------===//
+
+class builtin_inst: public instruction{  // common base for the builtin instruction classes below
+protected:
+  using instruction::instruction;  // re-exposes instruction's constructors to subclasses
+};
+
+class get_program_id_inst: public builtin_inst {  // builtin parameterized by an axis index
+private:
+  get_program_id_inst(type *ty, unsigned axis, const std::string &name, instruction *next);
+  std::string repr_impl() const { return "get_program_id(" + std::to_string(axis_) + ")"; }  // e.g. "get_program_id(0)"
+
+public:
+  static instruction* create(context &ctx, unsigned axis, const std::string &name = "", instruction *next = nullptr);
+  unsigned get_axis() const { return axis_; }
+  _TRITON_DEFINE_CLONE(get_program_id_inst)
+  _TRITON_DEFINE_ACCEPT(get_program_id_inst)
+
+private:
+  unsigned axis_;  // axis index; returned by get_axis() and embedded in the repr
+};
+
+class get_num_program_inst: public builtin_inst {  // builtin parameterized by an axis index
+private:
+  get_num_program_inst(type *ty, unsigned axis, const std::string &name, instruction *next);
+  std::string repr_impl() const { return "get_num_program(" + std::to_string(axis_) + ")"; }  // e.g. "get_num_program(0)"
+
+public:
+  static instruction* create(context &ctx, unsigned axis, const std::string &name = "", instruction *next = nullptr);
+  unsigned get_axis() const { return axis_; }
+  _TRITON_DEFINE_CLONE(get_num_program_inst)
+  _TRITON_DEFINE_ACCEPT(get_num_program_inst)
+
+private:
+  unsigned axis_;  // axis index; returned by get_axis() and embedded in the repr
+};
+
+class atomic_cas_inst: public builtin_inst {  // takes a pointer, a compare value and a new value
+private:
+  atomic_cas_inst(value *ptr, value *cmp, value *val, const std::string &name, instruction *next);
+  std::string repr_impl() const { return "atomic_cas"; }
+  _TRITON_DEFINE_CLONE(atomic_cas_inst)  // NOTE: expanded in the private section
+  _TRITON_DEFINE_ACCEPT(atomic_cas_inst)
+
+public:
+  static instruction* create(value *ptr, value *cmp, value *val, const std::string &name = "", instruction *next = nullptr);
+};
+
+class atomic_exch_inst: public builtin_inst {  // takes a pointer and a value
+private:
+  atomic_exch_inst(value *ptr, value *val, const std::string &name = "", instruction *next = nullptr);
+  std::string repr_impl() const { return "atomic_exch"; }
+  _TRITON_DEFINE_CLONE(atomic_exch_inst)  // NOTE: expanded in the private section
+  _TRITON_DEFINE_ACCEPT(atomic_exch_inst)
+
+public:
+  static instruction* create(value *ptr, value *val, const std::string &name = "", instruction *next = nullptr);
+};
+
+
+class exp_inst: public builtin_inst {  // single-argument builtin printed as "exp"
+private:
+  exp_inst(value *val, const std::string &name = "", instruction *next = nullptr);
+  std::string repr_impl() const { return "exp"; }
+  _TRITON_DEFINE_CLONE(exp_inst)  // NOTE: expanded in the private section
+  _TRITON_DEFINE_ACCEPT(exp_inst)
+
+public:
+  static instruction* create(value *val, const std::string &name = "", instruction *next = nullptr);
+};
+
+class log_inst: public builtin_inst {  // single-argument builtin printed as "log"
+private:
+  log_inst(value *val, const std::string &name = "", instruction *next = nullptr);
+  std::string repr_impl() const { return "log"; }
+  _TRITON_DEFINE_CLONE(log_inst)  // NOTE: expanded in the private section
+  _TRITON_DEFINE_ACCEPT(log_inst)
+
+public:
+  static instruction* create(value *val, const std::string &name = "", instruction *next = nullptr);
+};
+
+
+class dot_inst: public builtin_inst {  // builtin over three values A, B, C with a transposition flag for A and for B
+public:
+  enum TransT { NoTrans, Trans };  // NoTrans == 0, Trans == 1
+
+private:
+  dot_inst(value *A, value *B, value *C, TransT AT, TransT BT, const std::string &name, instruction *next);
+  std::string repr_impl() const { return "dot"; }  // repr does not include the transposition flags
+
+public:
+  static instruction *create(value *A, value *B, value *C, bool AT, bool BT, const std::string &name = "", instruction *next = nullptr);  // flags are bool here, TransT in the ctor
+  static instruction* create_nn(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr);
+  static instruction* create_nt(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr);
+  static instruction* create_tn(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr);
+  static instruction* create_tt(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr);
+  _TRITON_DEFINE_CLONE(dot_inst)
+  _TRITON_DEFINE_ACCEPT(dot_inst)
+};
+
+//class outer_inst: public builtin_inst {
+//private:
+//  outer_inst(value *A, value *B, value *C, const std::string &name, instruction *next);
+//public:
+//  static instruction* create(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr);
+//};
+
+class trans_inst: public builtin_inst {  // builtin carrying a permutation vector (perm_)
+public:
+  ir::type* get_res_ty(ir::type* in, std::vector<int> perm);  // perm is taken by value (copied)
+  std::vector<int> init_perm(ir::type* ty, const std::vector<int>& perm);
+
+private:
+  trans_inst(value *arg, const std::vector<int>& perm, const std::string& name, instruction* next);
+  std::string repr_impl() const { return "trans"; }
+
+public:
+  static instruction* create(value *arg, const std::vector<int> &perm = {}, const std::string &name = "", instruction *next = nullptr);  // perm defaults to empty
+  const std::vector<int> get_perm() const;  // returns by value, not by reference
+  _TRITON_DEFINE_CLONE(trans_inst)
+  _TRITON_DEFINE_ACCEPT(trans_inst)
+
+private:
+  std::vector<int> perm_;
+};
+
+class sqrt_inst: public builtin_inst {  // single-argument builtin printed as "sqrt"
+private:
+  sqrt_inst(value *arg, const std::string& name, instruction* next);  // private: use create()
+  std::string repr_impl() const { return "sqrt"; }
+public:
+  static instruction* create(value *arg, const std::string &name = "", instruction *next = nullptr);
+  _TRITON_DEFINE_CLONE(sqrt_inst)
+  _TRITON_DEFINE_ACCEPT(sqrt_inst)
+};
+
+class reduce_inst: public builtin_inst {  // builtin parameterized by an operation (op_t) and an axis
+public:
+  enum op_t{
+    ADD, SUB, MAX, MIN,  // NOTE(review): presumably the integer variants -- confirm
+    FADD, FSUB, FMAX, FMIN  // NOTE(review): presumably the floating-point variants -- confirm
+  };
+
+private:
+  static type* get_res_type(value *arg, unsigned axis);
+  static std::string to_str(op_t op);
+
+private:
+  reduce_inst(value* arg, op_t op, unsigned axis, const std::string& name, instruction* next);
+  std::string repr_impl() const { return "reduce"; }  // repr includes neither op_ nor axis_
+  _TRITON_DEFINE_CLONE(reduce_inst)  // NOTE: expanded in the private section
+  _TRITON_DEFINE_ACCEPT(reduce_inst)
+
+public:
+  static instruction* create(value *arg, op_t op, unsigned axis, const std::string &name = "", instruction *next = nullptr);
+  unsigned get_axis() const { return axis_; }
+  op_t get_op() const { return op_; }
+
+private:
+  unsigned axis_;  // returned by get_axis()
+  op_t op_;  // returned by get_op()
+};
+
+class select_inst: public builtin_inst {  // takes a predicate, an if-value and an else-value
+private:
+  select_inst(value *pred, value *if_value, value *else_value, const std::string& name, instruction* next);
+  std::string repr_impl() const { return "select"; }
+  _TRITON_DEFINE_CLONE(select_inst)  // NOTE: expanded in the private section
+  _TRITON_DEFINE_ACCEPT(select_inst)
+
+public:
+  static instruction* create(value *pred, value *if_value, value *else_value, const std::string &name = "", instruction *next = nullptr);
+  value* get_pred_op() { return get_operand(0); }  // operand 0
+  value* get_if_value_op() { return get_operand(1); }  // operand 1
+  value* get_else_value_op() { return get_operand(2); }  // operand 2
+};
+
+//===----------------------------------------------------------------------===//
+//                               intrinsics classes
+//===----------------------------------------------------------------------===//
+
+class copy_to_shared_inst: public unary_inst{  // unary instruction printed as "copy_to_shared"
+private:
+  using unary_inst::unary_inst;  // inherits the base constructors; instances come from create()
+  std::string repr_impl() const { return "copy_to_shared"; }
+
+public:
+  static copy_to_shared_inst* create(value *arg, const std::string &name = "",
+                                     instruction *next = nullptr);
+  _TRITON_DEFINE_CLONE(copy_to_shared_inst)
+  _TRITON_DEFINE_ACCEPT(copy_to_shared_inst)
+};
+
+class copy_from_shared_inst: public unary_inst{  // unary instruction printed as "copy_from_shared"
+private:
+  using unary_inst::unary_inst;  // inherits the base constructors; instances come from create()
+  std::string repr_impl() const { return "copy_from_shared"; }
+
+public:
+  static copy_from_shared_inst* create(value *arg, const std::string &name = "",
+                                     instruction *next = nullptr);
+  _TRITON_DEFINE_CLONE(copy_from_shared_inst)
+  _TRITON_DEFINE_ACCEPT(copy_from_shared_inst)
+};
+
+
+class recoalesce_inst: public unary_inst{  // unary instruction; instances come from create()
+private:
+  using unary_inst::unary_inst;
+  std::string repr_impl() const { return "recoalesce_inst"; }  // NOTE(review): only repr in this header that keeps the "_inst" suffix -- confirm intended
+
+public:
+  static recoalesce_inst* create(value *arg, const std::string &name = "", instruction *next = nullptr);
+  _TRITON_DEFINE_CLONE(recoalesce_inst)
+  _TRITON_DEFINE_ACCEPT(recoalesce_inst)
+};
+
+class barrier_inst: public instruction{  // takes no value arguments, only a context
+private:
+  barrier_inst(context &ctx, const std::string &name, instruction *next);
+  std::string repr_impl() const { return "barrier"; }
+  _TRITON_DEFINE_CLONE(barrier_inst)  // NOTE: expanded in the private section
+  _TRITON_DEFINE_ACCEPT(barrier_inst)
+
+public:
+  static barrier_inst* create(context &ctx, const std::string &name = "",
+                                            instruction *next = nullptr);
+};
+
+class async_wait_inst: public instruction{  // takes a context and an integer N, no value arguments
+private:
+  async_wait_inst(context &ctx, int N, const std::string &name, instruction *next);
+  std::string repr_impl() const { return "async_wait_group " + std::to_string(N_) ; }  // e.g. "async_wait_group 2"
+  _TRITON_DEFINE_CLONE(async_wait_inst)  // NOTE: expanded in the private section
+  _TRITON_DEFINE_ACCEPT(async_wait_inst)
+
+public:
+  static async_wait_inst* create(context &ctx, int N,
+                                 const std::string &name = "", instruction *next = nullptr);
+  int get_N() { return N_; }  // not const-qualified, unlike get_axis() on the builtin classes
+
+private:
+  int N_;  // integer parameter; returned by get_N() and embedded in the repr
+};
+
+// On NVIDIA, implementation is such that
+// constant_range = nv_dynamic_program_idx + nv_static_program_idx
+// so as to enable re-association on nv_static_program_idx which is constant
+class make_range_dyn: public instruction {  // instruction printed as "nv_dynamic_program_idx"
+private:
+  make_range_dyn(type *ty, const std::string &name, instruction *next);
+  std::string repr_impl() const { return "nv_dynamic_program_idx"; }
+  _TRITON_DEFINE_CLONE(make_range_dyn)  // NOTE: expanded in the private section
+  _TRITON_DEFINE_ACCEPT(make_range_dyn)
+
+public:
+  static make_range_dyn* create(type *ty, const std::string &name = "", instruction *next = nullptr);
+};
+
+class make_range_sta: public constant {  // a constant (not an instruction) that refers to a make_range
+private:
+  make_range_sta(make_range *range);  // private: obtain instances through get()
+
+public:
+  static make_range_sta *get(make_range* range);
+  make_range* get_range() const;
+  std::string repr() const { return "nv_static_program_idx"; }  // defines repr() directly, not repr_impl()
+  _TRITON_DEFINE_ACCEPT(make_range_sta)  // no _TRITON_DEFINE_CLONE for this class
+
+private:
+  make_range *range_;  // raw pointer to the associated make_range
+};
+
+
+/* constant range: instruction described by two constant_int bounds, first and last */
+class make_range: public instruction{
+  make_range(type *ty, constant_int* first, constant_int* last);  // no access specifier: private by default
+  std::string repr_impl() const { return "make_range[" + first_->repr() + " : " + last_->repr() + "]"; }  // dereferences both bounds
+  _TRITON_DEFINE_CLONE(make_range)
+  _TRITON_DEFINE_ACCEPT(make_range)
+
+public:
+  static make_range *create(constant_int *first, constant_int *last);
+  const constant_int* get_first() const;
+  const constant_int* get_last() const;
+
+private:
+  constant_int* first_;
+  constant_int* last_;
+};
+
+
+}
+}
+
+#endif
--- a/include/triton/ir/metadata.h
+++ b/include/triton/ir/metadata.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#ifndef _TRITON_IR_METADATA_H_
+#define _TRITON_IR_METADATA_H_
+
+namespace triton{
+namespace ir{
+
+
+/* Metadata: a (kind, unsigned value) pair; instances are obtained through get() */
+class metadata{
+public:
+  enum kind_t{
+    multiple_of  // the only kind declared here
+  };
+
+private:
+  metadata(kind_t kind, unsigned value);  // private: use get()
+
+public:
+  static metadata* get(kind_t kind, unsigned value);
+
+private:
+  kind_t kind_;  // no accessors are declared for either field in this header
+  unsigned value_;
+};
+
+}
+}
+
+#endif
--- a/include/triton/ir/module.h
+++ b/include/triton/ir/module.h
@@ -0,0 +1,118 @@
+#pragma once
+
+#ifndef _TRITON_IR_MODULE_H_
+#define _TRITON_IR_MODULE_H_
+
+#include <map>
+#include <set>
+#include <stack>
+#include <string>
+#include <functional>
+#include "triton/ir/builder.h"
+#include "triton/ir/metadata.h"
+#include "triton/ir/context.h"
+
+namespace triton{
+
+namespace lang{
+
+class iteration_statement;
+class compound_statement;
+
+}
+
+namespace ir{
+
+class basic_block;
+class phi_node;
+class value;
+class context;
+class function;
+class attribute;
+class function_type;
+class constant;
+class global_value;
+class alloc_const;
+
+/* Module */
+struct scope {  // name -> type and name -> value tables; module keeps a stack of these
+  std::map<std::string, ir::type*> types;
+  std::map<std::string, ir::value*> values;
+};
+
+class module {  // owns a context and a builder by value, plus per-module tables of values, functions, scopes and metadata
+  typedef std::pair<std::string, basic_block*> val_key_t;  // (name, block) key used by values_ and types_
+  friend class function;  // lets function call the private push_function()
+  typedef std::pair<ir::metadata::kind_t, unsigned> md_pair_t;  // (kind, value) pair stored in metadatas_
+
+public:
+  typedef std::map<std::string, global_value*> symbols_map_t;
+  typedef std::vector<function*> functions_list_t;
+  struct current_iteration_info_t{
+    lang::iteration_statement *statement;
+    basic_block *block;
+  };
+
+private:
+  phi_node *make_phi(type *ty, unsigned num_values, basic_block *block);
+  value *try_remove_trivial_phis(ir::phi_node *&phi);  // phi is passed by reference to pointer
+  value *add_phi_operands(const std::string& name, phi_node *&phi);
+  value *get_value_recursive(const std::string& name, basic_block *block);
+  void push_function(function *fn) { functions_.push_back(fn); }  // appends to functions_
+
+public:
+  module(const std::string &name);
+  context& get_context();
+  builder& get_builder();
+  // Setters
+  void set_value(const std::string& name, basic_block* block, value *x);
+  void set_value(const std::string& name, value* x);
+  void set_const(const std::string& name);
+  void set_continue_fn(std::function<ir::value*()> fn);
+  // Getters
+  value *get_value(const std::string& name, basic_block* block);
+  value *get_value(const std::string& name);
+  const std::string& get_name();
+  std::function<ir::value*()> get_continue_fn();
+  // Seal block -- no more predecessors will be added
+  void seal_block(basic_block *block);
+  // Functions
+  const functions_list_t &get_function_list() const { return functions_; }
+  functions_list_t &get_function_list()             { return functions_; }
+  function *get_or_insert_function(const std::string &name, function_type *ty);
+  // Scope: add_new_scope() pushes an empty scope when the stack is empty, otherwise a copy of the current top
+  void add_new_scope()                                        { if(scopes_.empty()) scopes_.push(scope()); else scopes_.push(scope(get_scope())); }
+  void pop_scope()                                            { scopes_.pop(); }  // no emptiness check; caller must have pushed first
+  scope& get_scope()                                          { return scopes_.top(); }  // no emptiness check either
+  // Const allocation
+  void add_alloc(ir::alloc_const* x)                          { allocs_.push_back(x); }
+  const std::vector<ir::alloc_const*>& allocs()               { return allocs_; }
+  // Register global: operator[] assignment, so an existing entry with the same name is overwritten
+  void register_global(const std::string& name, ir::value *x) { globals_[name] = x; }
+  const std::map<std::string, ir::value*>& globals() const    { return globals_; }
+  // Metadata: one (kind, value) pair per name; a later call with the same name overwrites the earlier one
+  void add_metadata(const std::string &name, md_pair_t x)     { metadatas_[name] = x; }
+
+private:
+  std::string name_;
+  context context_;  // declared before builder_, so it is constructed first
+  builder builder_;
+  std::map<val_key_t, value*> values_;
+  std::map<val_key_t, type*> types_;
+  std::set<std::string> const_;
+  std::set<basic_block*> sealed_blocks_;
+  std::map<basic_block*, std::map<std::string, phi_node*>> incomplete_phis_;
+  functions_list_t functions_;
+  symbols_map_t symbols_;
+  std::function<ir::value*()> continue_fn_;
+  std::map<value*, value**> current_phi_;
+  std::stack<scope> scopes_;
+  std::vector<ir::alloc_const*> allocs_;
+  std::map<std::string, ir::value*> globals_;
+  std::map<std::string, md_pair_t> metadatas_;
+};
+
+}
+}
+
+#endif
--- a/include/triton/ir/print.h
+++ b/include/triton/ir/print.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#ifndef _TRITON_IR_PRINT_H_
+#define _TRITON_IR_PRINT_H_
+
+#include "builder.h"
+
+namespace triton{
+namespace ir{
+
+class module;
+
+void print(module &mod, std::ostream& os);  // declaration only; presumably writes a textual form of mod to os -- confirm in the definition
+
+}
+}
+
+#endif
--- a/include/triton/ir/type.h
+++ b/include/triton/ir/type.h
@@ -0,0 +1,239 @@
+#pragma once
+
+#ifndef _TRITON_IR_TYPE_H_
+#define _TRITON_IR_TYPE_H_
+
+#include <cassert>
+#include <vector>
+#include <string>
+
+namespace triton{
+namespace ir{
+
+class context;
+class value;
+class integer_type;
+class constant_int;
+
+/* Type: base class of all IR types; stores a context reference, a type id and a list of contained types */
+class type {
+public:
+  typedef std::vector<unsigned>	         tile_shapes_t;
+
+protected:
+  typedef std::vector<type*>                  contained_tys_vec_t;
+  typedef contained_tys_vec_t::iterator       ty_iterator;
+  typedef contained_tys_vec_t::const_iterator const_ty_iterator;
+
+public:
+  enum id_t {
+    // primitive types
+    VoidTyID = 0,    ///<  0: type with no size
+    HalfTyID,        ///<  1: 16-bit floating point type
+    FloatTyID,       ///<  2: 32-bit floating point type
+    DoubleTyID,      ///<  3: 64-bit floating point type
+    X86_FP80TyID,    ///<  4: 80-bit floating point type (X87)
+    FP128TyID,       ///<  5: 128-bit floating point type (112-bit mantissa)
+    PPC_FP128TyID,   ///<  6: 128-bit floating point type (two 64-bits, PowerPC)
+    LabelTyID,       ///<  7: Labels
+    MetadataTyID,    ///<  8: Metadata
+    TokenTyID,       ///<  9: Token
+    // derived types
+    IntegerTyID,     ///< 10: Arbitrary bit width integers
+    FunctionTyID,    ///< 11: Functions
+    PointerTyID,     ///< 12: Pointers
+    StructTyID,      ///< 13: Struct
+    TileTyID,        ///< 14: Tile
+  };
+
+public:
+  //constructors
+  type(context &ctx, id_t id) : ctx_(ctx), id_(id) { }
+
+  //destructor
+  virtual ~type(){}
+
+  // accessors
+  context &get_context() const { return ctx_; }
+  id_t get_type_id() const     { return id_;  }
+  // type attributes (all defined out of line)
+  unsigned get_fp_mantissa_width() const;
+  unsigned get_integer_bitwidth() const;
+  unsigned get_tile_bitwidth() const;
+  unsigned get_primitive_size_in_bits() const;
+  type *get_scalar_ty() const;
+  const tile_shapes_t& get_tile_shapes() const;
+  const size_t get_tile_rank() const;
+  const size_t get_tile_ranks1() const;
+  unsigned get_tile_num_elements() const;
+  type *get_tile_element_ty() const;
+  unsigned get_pointer_address_space() const;
+  type *get_pointer_element_ty() const;
+
+  // primitive predicates: each compares id_ against one id_t enumerator
+  bool is_void_ty() const               { return id_ == VoidTyID; }
+  bool is_half_ty() const               { return id_ == HalfTyID; }
+  bool is_float_ty() const              { return id_ == FloatTyID; }
+  bool is_double_ty() const             { return id_ == DoubleTyID; }
+  bool is_label_ty()  const             { return id_ == LabelTyID;}
+  bool is_metadata_ty() const           { return id_ == MetadataTyID; }
+  bool is_token_ty() const              { return id_ == TokenTyID; }
+  bool is_integer_ty() const            { return id_ == IntegerTyID; }
+  bool is_integer_ty(unsigned bitwidth) { return is_integer_ty() &&  // NOTE(review): non-const overload; a const overload with the same parameter is declared below -- confirm both are needed
+                                                 get_integer_bitwidth() == bitwidth;}
+  bool is_bool_ty() const               { return is_integer_ty(1); }  // const context, so this resolves to the const overload declared below
+  bool is_pointer_ty() const            { return id_ == PointerTyID; }
+  bool is_tile_ty() const               { return id_ == TileTyID; }
+
+  // Composite predicates
+  bool is_int_or_tileint_ty();
+  bool is_integer_ty(unsigned width) const;
+  bool is_floating_point_ty() const;
+  bool is_sized() const ;
+
+  // Factory methods
+  // primitive types
+  static type *get_void_ty(context &ctx);
+  static type *get_label_ty(context &ctx);
+  // half
+  static type *get_half_ty(context &ctx);
+  static type *get_float_ty(context &ctx);
+  static type *get_double_ty(context &ctx);
+  // integer types
+  static integer_type *get_int1_ty(context &ctx);
+  static integer_type *get_int8_ty(context &ctx);
+  static integer_type *get_int16_ty(context &ctx);
+  static integer_type *get_int32_ty(context &ctx);
+  static integer_type *get_int64_ty(context &ctx);
+  static integer_type *get_int128_ty(context &ctx);
+
+  // repr: tile_repr() yields "<element repr><d0, d1, ...>", e.g. "f32<16, 32>"
+  std::string tile_repr() const {
+    std::string res = get_tile_element_ty()->repr();
+    auto shapes = get_tile_shapes();  // `auto` deduces a value type, so this is a copy of the shape vector
+    res += "<";
+    for(size_t i = 0; i < shapes.size(); i++){
+      if(i > 0)
+        res += ", ";
+      res += std::to_string(shapes[i]);
+    }
+    res+= ">";
+    return res;
+  }
+
+  std::string repr() const {  // short textual name for this type, selected by id_
+    switch(id_) {
+      case VoidTyID: return "void";
+      case HalfTyID: return "f16";
+      case FloatTyID: return "f32";
+      case DoubleTyID: return "f64";
+      case X86_FP80TyID: return "f80";
+      case FP128TyID: return "f128";
+      case PPC_FP128TyID: return "ppcf128";
+      case LabelTyID: return "label";
+      case MetadataTyID: return "md";
+      case TokenTyID: return "tok";
+      case IntegerTyID: return "i" + std::to_string(get_integer_bitwidth());
+      case FunctionTyID: return "fn";
+      case PointerTyID: return get_pointer_element_ty()->repr() + "*";  // recurses into the pointee type
+      case StructTyID: return "struct";
+      case TileTyID: return tile_repr();
+      default: break;
+    }
+    assert(false);  // only reached for an id_ outside id_t; with NDEBUG the function returns "" instead
+    return "";
+  };
+
+private:
+  context &ctx_;
+  id_t id_;
+
+protected:
+  contained_tys_vec_t contained_tys_;  // read directly by pointer_type and function_type accessors
+};
+
+class integer_type: public type {  // type with id IntegerTyID plus a bit width
+  friend class context_impl;  // context_impl may call the private constructor
+
+private:
+  // constructors
+  integer_type(context &ctx, unsigned bitwidth)
+    : type(ctx, IntegerTyID), bitwidth_(bitwidth){ }
+
+public:
+  // accessors
+  unsigned get_bitwidth() const { return bitwidth_; }
+
+  // factory methods
+  static integer_type* get(context &ctx, unsigned width);
+
+private:
+  unsigned bitwidth_;  // width in bits, as passed to the constructor
+};
+
+class composite_type: public type{  // base class of tile_type
+protected:
+  using type::type;  // re-exposes type's constructors with protected access
+
+public:
+  bool index_valid(value *idx) const;
+  type* get_type_at_index(value *idx) const;
+};
+
+class tile_type: public composite_type {  // composite type carrying a list of shapes; obtained through get()
+private:
+  tile_type(type *ty, const tile_shapes_t &shapes);
+  static bool is_valid_elt_ty(type *ty);
+
+public:
+  // accessors
+  const tile_shapes_t& get_shapes() const { return shapes_; }
+  unsigned get_num_elements() const;
+  unsigned get_bitwidth() const;
+
+  // factory methods
+  static tile_type* get(type *ty, const tile_shapes_t &shapes);
+  static tile_type* get_same_shapes(type *ty, type *ref);
+
+private:
+  tile_shapes_t shapes_;  // returned by reference from get_shapes()
+};
+
+class pointer_type: public type {  // pointer to an element type in a numbered address space; obtained through get()
+private:
+  pointer_type(type *ty, unsigned address_space);
+  static bool is_valid_elt_ty(type *ty);
+
+public:
+  // accessors
+  unsigned get_address_space()               const { return address_space_; }
+  type *get_element_ty()                     const { return contained_tys_[0]; }  // unchecked index into contained_tys_
+  // factory methods
+  static pointer_type* get(type *ty, unsigned address_space);
+
+private:
+  unsigned address_space_;
+};
+
+class function_type: public type {  // contained_tys_[0] is the return type; parameter types start at index 1
+private:
+  function_type(type *ret_ty, const std::vector<type *> &param_tys);
+
+public:
+  // accessors
+  unsigned get_num_params()         const { return contained_tys_.size() - 1;  }  // size minus the return-type slot
+  const_ty_iterator params_begin() const { return contained_tys_.begin() + 1; }
+  const_ty_iterator params_end()   const { return contained_tys_.end(); }
+  ty_iterator       params_begin()       { return contained_tys_.begin() + 1; }
+  ty_iterator       params_end()         { return contained_tys_.end(); }
+  type*    get_param_ty(unsigned i) const { return contained_tys_.at(1 + i);   }  // bounds-checked via at()
+  type*    get_return_ty()          const { return contained_tys_.at(0);       }
+  // factory methods
+  static function_type* get(type *ret_ty, const std::vector<type*>& param_tys);
+};
+
+
+}
+}
+
+#endif
--- a/include/triton/ir/utils.h
+++ b/include/triton/ir/utils.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#ifndef _TRITON_IR_CFG_H_
+#define _TRITON_IR_CFG_H_
+
+#include <vector>
+#include <functional>
+
+namespace triton{
+namespace ir{
+
+class module;
+class function;
+class basic_block;
+class instruction;
+class value;
+
+class cfg {  // holds a single static helper; no instance state
+public:
+  static std::vector<basic_block *> reverse_post_order(function* fn);  // returns fn's blocks by value; ordering per the name, defined out of line
+};
+
+void for_each_instruction(ir::module& mod, const std::function<void(triton::ir::instruction*)> &fn);  // callback receives instruction*; traversal is defined out of line
+void for_each_value(ir::module& mod, const std::function<void(triton::ir::value *)> &fn);  // same shape, callback receives value*
+
+}
+}
+
+#endif
--- a/include/triton/ir/value.h
+++ b/include/triton/ir/value.h
@@ -0,0 +1,93 @@
+#pragma once
+
+#ifndef _TRITON_IR_VALUE_H_
+#define _TRITON_IR_VALUE_H_
+
+#include <string>
+#include <vector>
+#include <set>
+
+namespace triton{
+namespace ir{
+
+class type;
+class use;
+class user;
+class visitor;
+
+//===----------------------------------------------------------------------===//
+//                               value class
+//===----------------------------------------------------------------------===//
+
+class value {  // abstract base (accept() is pure virtual) holding a type, a name and the set of users
+public:
+  typedef std::set<user*> users_t;
+
+public:
+  // constructor
+  value(type *ty, const std::string &name = "");
+  virtual ~value(){ }
+  // uses
+  void add_use(user* arg);
+  users_t::iterator erase_use(user* arg);
+  const std::set<user*> &get_users() { return users_; }  // read-only view of users_
+  void replace_all_uses_with(value *target);
+  // name
+  void set_name(const std::string &name);
+  const std::string &get_name() const { return name_; }
+  type* get_type() const { return ty_; }
+  // visitor
+  virtual void accept(visitor *v) = 0;
+
+private:
+  std::string name_;
+
+protected:
+  type *ty_;
+  users_t users_;  // a std::set, so each user appears at most once
+};
+
+//===----------------------------------------------------------------------===//
+//                               user class
+//===----------------------------------------------------------------------===//
+
+class user: public value{  // a value that holds a vector of operand pointers (regular plus "hidden")
+public:
+  typedef std::vector<value*>      ops_t;
+  typedef ops_t::iterator       op_iterator;
+  typedef ops_t::const_iterator const_op_iterator;
+
+protected:
+  void resize_ops(unsigned num_ops) { ops_.resize(num_ops + num_hidden_); num_ops_ = num_ops; }  // ops_ always has num_ops_ + num_hidden_ slots
+  void resize_hidden(unsigned num_hidden) { ops_.resize(num_ops_ + num_hidden); num_hidden_ = num_hidden; }
+
+public:
+  // Constructor: ops_ starts with num_ops value-initialized (null) slots and no hidden operands
+  user(type *ty, unsigned num_ops, const std::string &name = "")
+      : value(ty, name), ops_(num_ops), num_ops_(num_ops), num_hidden_(0){
+  }
+  virtual ~user() { }
+
+  // Operands
+  const ops_t& ops() { return ops_; }
+  op_iterator op_begin() { return ops_.begin(); }
+  op_iterator op_end()   { return ops_.end(); }  // end of the whole vector, so the range includes any hidden slots
+  void     set_operand(unsigned i, value *x);
+  value   *get_operand(unsigned i) const;
+  unsigned get_num_operands() const ;
+  unsigned get_num_hidden() const;
+
+  // Utils
+  value::users_t::iterator replace_uses_of_with(value *before, value *after);
+
+
+private:
+  ops_t ops_;
+  unsigned num_ops_;
+  unsigned num_hidden_;
+};
+
+}
+}
+
+#endif
--- a/include/triton/ir/visitor.h
+++ b/include/triton/ir/visitor.h
@@ -0,0 +1,162 @@
+#pragma once
+
+#ifndef _TRITON_IR_VISITOR_H_
+#define _TRITON_IR_VISITOR_H_
+
+
+namespace triton{
+namespace ir{
+
+class value;
+
+class instruction;
+
+class phi_node;
+class binary_operator;
+class getelementptr_inst;
+
+class icmp_inst;
+class fcmp_inst;
+class cast_inst;
+class trunc_inst;
+class z_ext_inst;
+class s_ext_inst;
+class fp_trunc_inst;
+class fp_ext_inst;
+class ui_to_fp_inst;
+class si_to_fp_inst;
+class fp_to_ui_inst;
+class fp_to_si_inst;
+class ptr_to_int_inst;
+class int_to_ptr_inst;
+class bit_cast_inst;
+class addr_space_cast_inst;
+
+class return_inst;
+class cond_branch_inst;
+class uncond_branch_inst;
+
+
+class unmasked_load_inst;
+class masked_load_inst;
+class unmasked_store_inst;
+class masked_store_inst;
+
+class retile_inst;
+class reshape_inst;
+class splat_inst;
+class broadcast_inst;
+class downcast_inst;
+
+class exp_inst;
+class log_inst;
+
+class get_program_id_inst;
+class get_num_program_inst;
+class atomic_cas_inst;
+class atomic_exch_inst;
+class atomic_add_inst;
+class dot_inst;
+class trans_inst;
+class sqrt_inst;
+class reduce_inst;
+class select_inst;
+
+class recoalesce_inst;
+class copy_to_shared_inst;
+class copy_from_shared_inst;
+class masked_load_async_inst;
+class barrier_inst;
+class async_wait_inst;
+class make_range_dyn;
+class make_range;
+
+class make_range_sta;
+class undef_value;
+class constant_int;
+class constant_fp;
+class global_value;
+class global_object;
+class alloc_const;
+
+class constant_fp;
+class undef_value;
+class constant_int;
+class constant_fp;
+class global_value;
+class global_object;
+class alloc_const;
+
+class function;
+
+class basic_block;
+
+class argument;
+
+class visitor {  // Abstract visitor over triton::ir nodes: one visit_* hook per node class.
+public:
+  virtual ~visitor() {}
+
+  virtual void visit_value(ir::value*);  // The only non-pure hook; declared here, body not in this header (presumably forwards to value::accept — confirm)
+
+  virtual void visit_basic_block(basic_block*) = 0;
+  virtual void visit_argument(argument*) = 0;
+  virtual void visit_phi_node(phi_node*) = 0;
+  virtual void visit_binary_operator(binary_operator*) = 0;
+  virtual void visit_getelementptr_inst(getelementptr_inst*) = 0;
+
+  virtual void visit_icmp_inst(icmp_inst*) = 0;
+  virtual void visit_fcmp_inst(fcmp_inst*) = 0;
+  virtual void visit_cast_inst(cast_inst*) = 0;  // Single hook for casts; the individual *_ext/*_trunc/... classes declared above have none of their own
+
+  virtual void visit_return_inst(return_inst*) = 0;
+  virtual void visit_cond_branch_inst(cond_branch_inst*) = 0;
+  virtual void visit_uncond_branch_inst(uncond_branch_inst*) = 0;
+
+
+  virtual void visit_unmasked_load_inst(unmasked_load_inst*) = 0;
+  virtual void visit_masked_load_inst(masked_load_inst*) = 0;
+  virtual void visit_unmasked_store_inst(unmasked_store_inst*) = 0;
+  virtual void visit_masked_store_inst(masked_store_inst*) = 0;
+
+  virtual void visit_exp_inst(exp_inst*) = 0;
+  virtual void visit_log_inst(log_inst*) = 0;
+
+  virtual void visit_reshape_inst(reshape_inst*) = 0;  // NOTE(review): retile_inst is forward-declared above but has no hook here
+  virtual void visit_splat_inst(splat_inst*) = 0;
+  virtual void visit_broadcast_inst(broadcast_inst*) = 0;
+  virtual void visit_downcast_inst(downcast_inst*) = 0;
+
+  virtual void visit_get_program_id_inst(get_program_id_inst*) = 0;
+  virtual void visit_get_num_program_inst(get_num_program_inst*) = 0;
+  virtual void visit_atomic_cas_inst(atomic_cas_inst*) = 0;
+  virtual void visit_atomic_exch_inst(atomic_exch_inst*) = 0;
+  virtual void visit_atomic_add_inst(atomic_add_inst*) = 0;
+  virtual void visit_dot_inst(dot_inst*) = 0;
+  virtual void visit_trans_inst(trans_inst*) = 0;
+  virtual void visit_sqrt_inst(sqrt_inst*) = 0;
+  virtual void visit_reduce_inst(reduce_inst*) = 0;
+  virtual void visit_select_inst(select_inst*) = 0;
+
+  virtual void visit_recoalesce_inst(recoalesce_inst*) = 0;
+  virtual void visit_copy_to_shared_inst(copy_to_shared_inst*) = 0;
+  virtual void visit_copy_from_shared_inst(copy_from_shared_inst*) = 0;
+  virtual void visit_masked_load_async_inst(masked_load_async_inst*)= 0;
+  virtual void visit_barrier_inst(barrier_inst*) = 0;
+  virtual void visit_async_wait_inst(async_wait_inst*) = 0;
+  virtual void visit_make_range_dyn(make_range_dyn*) = 0;
+  virtual void visit_make_range(make_range*) = 0;
+
+  virtual void visit_function(function*) = 0;
+
+  virtual void visit_make_range_sta(make_range_sta*) = 0;
+  virtual void visit_undef_value(undef_value*) = 0;
+  virtual void visit_constant_int(constant_int*) = 0;
+  virtual void visit_constant_fp(constant_fp*) = 0;
+  virtual void visit_alloc_const(alloc_const*) = 0;  // NOTE(review): global_value / global_object are forward-declared above but have no hook here
+};
+
+}
+}
+
+#endif
--- a/include/triton/lang/ast.h
+++ b/include/triton/lang/ast.h
@@ -0,0 +1,823 @@
+#pragma once
+
+#ifndef _WGTCC_AST_H_
+#define _WGTCC_AST_H_
+
+#include "error.h"
+#include "token.h"
+#include "type.h"
+
+#include <cassert>
+#include <list>
+#include <memory>
+#include <string>
+
+
+class Visitor;
+template<typename T> class Evaluator;
+class AddrEvaluator;
+class Generator;
+
+class Scope;
+class Parser;
+class ASTNode;
+class Token;
+class TokenSequence;
+
+// Expressions
+class Expr;
+class BinaryOp;
+class UnaryOp;
+class ConditionalOp;
+class FuncCall;
+class TempVar;
+class Constant;
+
+class Identifier;
+class Object;
+struct Initializer;
+class Declaration;
+class Enumerator;
+
+// Statements
+class Stmt;
+class IfStmt;
+class ForStmt;
+class JumpStmt;
+class LabelStmt;
+class EmptyStmt;
+class CompoundStmt;
+class FuncDef;
+class TranslationUnit;
+
+
+/*
+ * AST Node
+ */
+
+class ASTNode {  // Abstract root of the AST hierarchy; nodes are traversed through Accept(Visitor*).
+public:
+  struct Attr{  // One attribute: a kind plus the expressions given as its arguments.
+
+    enum KindT{
+      MULTIPLEOF,
+      ALIGNED,
+      NOALIAS,
+      READONLY,
+      WRITEONLY,
+      RETUNE,
+    };
+
+    KindT kind;
+    std::vector<Expr*> vals;  // NOTE(review): <vector> is not among this header's direct includes — presumably arrives transitively; confirm
+  };
+  using AttrList = std::vector<Attr>;
+
+public:
+  virtual ~ASTNode() {}
+  virtual void Accept(Visitor* v) = 0;  // Pure virtual: every concrete node supplies its own.
+
+protected:
+  ASTNode() {}  // Protected: the base is only constructed by subclasses.
+
+  MemPool* pool_ {nullptr};  // Null by default; nothing in this header assigns it.
+};
+
+using ExtDecl = ASTNode;
+
+
+/*
+ * Statements
+ */
+
+class Stmt : public ASTNode {  // Common base of statement nodes (Expr derives from it as well); adds no members.
+public:
+  virtual ~Stmt() {}
+
+protected:
+   Stmt() {}  // Protected: only subclasses construct the base.
+};
+
+
+class EmptyStmt : public Stmt {  // Statement node that carries no data of its own.
+  template<typename T> friend class Evaluator;
+  friend class AddrEvaluator;
+  friend class Generator;
+
+public:
+  static EmptyStmt* New();  // Public factory; the constructor itself is protected.
+  virtual ~EmptyStmt() {}
+  virtual void Accept(Visitor* v);  // Overrides ASTNode::Accept; body is not in this header.
+
+protected:
+  EmptyStmt() {}
+};
+
+
+class LabelStmt : public Stmt {  // A label, identified by an integer tag drawn at construction.
+  template<typename T> friend class Evaluator;
+  friend class AddrEvaluator;
+  friend class Generator;
+
+public:
+  static LabelStmt* New();  // Public factory; the constructor itself is protected.
+  ~LabelStmt() {}
+  virtual void Accept(Visitor* v);
+  std::string Repr() const { return ".L" + std::to_string(tag_); }  // Textual form: ".L" followed by the decimal tag.
+
+protected:
+  LabelStmt(): tag_(GenTag()) {}
+
+private:
+  static int GenTag() {  // Pre-increments a function-local static counter, so the first tag handed out is 1.
+    static int tag = 0;
+    return ++tag;
+  }
+
+  int tag_; // Use an integer tag value rather than storing a string directly
+};
+
+
+class IfStmt : public Stmt {  // if statement: condition, then-branch and an optional else-branch.
+  template<typename T> friend class Evaluator;
+  friend class AddrEvaluator;
+  friend class Generator;
+public:
+  static IfStmt* New(Expr* cond, Stmt* then, Stmt* els=nullptr);  // Public factory; els defaults to nullptr.
+  virtual ~IfStmt() {}
+  virtual void Accept(Visitor* v);
+
+protected:
+  IfStmt(Expr* cond, Stmt* then, Stmt* els = nullptr)  // Stores the three pointers as given.
+      : cond_(cond), then_(then), else_(els) {}
+
+private:
+  Expr* cond_;
+  Stmt* then_;
+  Stmt* else_;  // nullptr when no else-branch was supplied
+};
+
+class ForStmt: public Stmt {  // for statement: body plus optional init / cond / step parts.
+  template<typename T> friend class Evaluator;
+  friend class AddrEvaluator;
+  friend class Generator;
+public:
+  static ForStmt* New(Stmt* body, Stmt* init = nullptr, Expr* cond = nullptr, Expr* step = nullptr);  // Note the order: body comes first.
+  virtual ~ForStmt() {}
+  virtual void Accept(Visitor* v);
+
+protected:
+  ForStmt(Stmt* body, Stmt* init = nullptr, Expr* cond = nullptr, Expr* step = nullptr)  // Stores the four pointers as given.
+      : body_(body), init_(init), cond_(cond), step_(step) {}
+
+private:
+  Stmt* body_;
+  Stmt* init_;  // each of init_/cond_/step_ is nullptr when omitted
+  Expr* cond_;
+  Expr* step_;
+};
+
+class JumpStmt : public Stmt {  // Jump to a LabelStmt; the target can be replaced later via SetLabel().
+  template<typename T> friend class Evaluator;
+  friend class AddrEvaluator;
+  friend class Generator;
+
+public:
+  static JumpStmt* New(LabelStmt* label);  // Public factory; the constructor itself is protected.
+  virtual ~JumpStmt() {}
+  virtual void Accept(Visitor* v);
+  void SetLabel(LabelStmt* label) { label_ = label; }  // Overwrites the target label.
+
+protected:
+  JumpStmt(LabelStmt* label): label_(label) {}
+
+private:
+  LabelStmt* label_;  // stored as given; no null check is made in this header
+};
+
+
+class ReturnStmt: public Stmt {  // return statement wrapping the returned expression.
+  template<typename T> friend class Evaluator;
+  friend class AddrEvaluator;
+  friend class Generator;
+
+public:
+  static ReturnStmt* New(Expr* expr);  // Public factory; the constructor itself is protected.
+  virtual ~ReturnStmt() {}
+  virtual void Accept(Visitor* v);
+
+protected:
+  ReturnStmt(::Expr* expr): expr_(expr) {}
+
+private:
+  ::Expr* expr_;  // stored as given (presumably nullptr for a bare `return;` — confirm in the parser)
+};
+
+
+using StmtList = std::list<Stmt*>;
+
+class CompoundStmt : public Stmt {  // Block statement: a list of statements plus an optional scope pointer.
+  template<typename T> friend class Evaluator;
+  friend class AddrEvaluator;
+  friend class Generator;
+
+public:
+  static CompoundStmt* New(StmtList& stmts, ::Scope* scope=nullptr);  // Takes a non-const reference although the constructor only needs a const one.
+  virtual ~CompoundStmt() {}
+  virtual void Accept(Visitor* v);
+  StmtList& Stmts() { return stmts_; }  // Mutable access to the stored list.
+  ::Scope* Scope() { return scope_; }  // Member named like the type, hence the `::Scope` qualification throughout.
+
+protected:
+  CompoundStmt(const StmtList& stmts, ::Scope* scope=nullptr)  // Copies the statement list into stmts_.
+      : stmts_(stmts), scope_(scope) {}
+
+private:
+  StmtList stmts_;
+  ::Scope* scope_;
+};
+
+
+struct Initializer {  // One initializer entry: type, offset, optional bit-field range and the initializing expression.
+  Initializer(Type* type,
+              int offset,
+              Expr* expr,
+              unsigned char bitFieldBegin=0,
+              unsigned char bitFieldWidth=0)
+      : type_(type),
+        offset_(offset),
+        bitFieldBegin_(bitFieldBegin),
+        bitFieldWidth_(bitFieldWidth),
+        expr_(expr) {}
+
+  bool operator<(const Initializer& rhs) const;  // Ordering required by InitList (std::set<Initializer>); body not in this header.
+
+  // Type of either the object itself or of the member
+  // that is being initialized
+  Type* type_;
+  int offset_;
+  unsigned char bitFieldBegin_;
+  unsigned char bitFieldWidth_;
+
+  Expr* expr_;  // declared last, matching its position in the constructor's init list
+};
+
+
+using InitList = std::set<Initializer>;
+
+class Declaration: public Stmt {  // Declaration statement: the declared Object plus its set of initializers.
+  template<typename T> friend class Evaluator;
+  friend class AddrEvaluator;
+  friend class Generator;
+
+public:
+  static Declaration* New(Object* obj);  // Public factory; the constructor itself is protected.
+  virtual ~Declaration() {}
+  virtual void Accept(Visitor* v);
+  InitList& Inits() { return inits_; }  // Mutable access to the initializer set.
+  Object* Obj() { return obj_; }
+  void AddInit(Initializer init);  // Takes the initializer by value; body not in this header.
+
+protected:
+  Declaration(Object* obj): obj_(obj) {}  // inits_ starts out empty.
+
+  Object* obj_;
+  InitList inits_;
+};
+
+
+/*
+ * Expr
+ *  BinaryOp
+ *  UnaryOp
+ *  ConditionalOp
+ *  FuncCall
+ *  Constant
+ *  Identifier
+ *  Object
+ *  TempVar
+ */
+
+class Expr : public Stmt {  // Abstract base of expression nodes: a source token plus a (qualified) type.
+  template<typename T> friend class Evaluator;
+  friend class AddrEvaluator;
+  friend class Generator;
+  friend class LValAssigner;
+
+public:
+  virtual ~Expr() {}
+  ::Type* Type() { return type_.GetPtr(); }  // Raw type pointer extracted from the stored QualType.
+  virtual bool IsLVal() = 0;
+  virtual void TypeChecking() = 0;
+  void EnsureCompatible(const QualType lhs, const QualType rhs) const;
+  void EnsureCompatibleOrVoidPointer(const QualType lhs,
+                                     const QualType rhs) const;
+  const Token* Tok() const { return tok_; }
+  void SetTok(const Token* tok) { tok_ = tok; }
+
+  static Expr* MayCast(Expr* expr);  // Used by the operator constructors below; body not in this header.
+  static Expr* MayCast(Expr* expr, QualType desType);
+  static ::Type* TryExtractScalarType(Expr* loc, Expr *operand);
+  static ::Type* ScalarOrLikeTile(Expr* operand, ::Type* ty);
+
+  virtual bool IsNullPointerConstant() const { return false; }  // Default answer; virtual so subclasses may override.
+  bool IsConstQualified() const { return type_.IsConstQualified(); }
+  bool IsRestrictQualified() const { return type_.IsRestrictQualified(); }
+  bool IsVolatileQualified() const { return type_.IsVolatileQualified(); }
+
+protected:
+  // An expression can be constructed without specifying a type;
+  // the type should then be evaluated in TypeChecking()
+  Expr(const Token* tok, QualType type): tok_(tok), type_(type) {}
+
+  const Token* tok_;  // may be null: TempVar passes nullptr here
+  QualType type_;
+};
+
+
+/*
+ * '+', '-', '*', '/', '%', '<', '>', '<<', '>>', '|', '&', '^'
+ * '=' (compound assignment operators are split into two operations)
+ * '==', '!=', '<=', '>=',
+ * '&&', '||'
+ * '[' (subscript operator), '.' (member operator)
+ * ',' (comma operator),
+ */
+class BinaryOp : public Expr {  // Binary expression node: operator code plus left/right operands.
+  template<typename T> friend class Evaluator;
+  friend class AddrEvaluator;
+  friend class Generator;
+  friend class LValAssigner;
+  friend class Declaration;
+
+public:
+  static BinaryOp* New(const Token* tok, Expr* lhs, Expr* rhs);
+  static BinaryOp* New(const Token* tok, int op, Expr* lhs, Expr* rhs);
+  virtual ~BinaryOp() {}
+  virtual void Accept(Visitor* v);
+
+  // Only masked dereference, member access and subscripting can be lvalues
+  virtual bool IsLVal() {
+    if (op_ == Token::MASKED_DEREF)
+      return true;
+    if (op_ != '.' && op_ != ']')
+      return false;
+    // Neither form is an lvalue when the result is an array; '.' also needs an lvalue lhs
+    return !Type()->ToArray() && (op_ == ']' || lhs_->IsLVal());
+  }
+  ArithmType* Convert();
+  static void Broadcast(Expr* loc, Expr*& lhs, Expr*& rhs, QualType &type);
+
+  virtual void TypeChecking();
+  void SubScriptingOpTypeChecking();
+  void MemberRefOpTypeChecking();
+  void MultiOpTypeChecking();
+  void AdditiveOpTypeChecking();
+  void ShiftOpTypeChecking();
+  void RangeOpTypeChecking();
+  void MatmulOpTypeChecking();
+  void MaskedDerefOpTypeChecking();
+  void RelationalOpTypeChecking();
+  void EqualityOpTypeChecking();
+  void BitwiseOpTypeChecking();
+  void LogicalOpTypeChecking();
+  void AssignOpTypeChecking();
+  void CommaOpTypeChecking();
+
+protected:
+  BinaryOp(const Token* tok, int op, Expr* lhs, Expr* rhs)
+      : Expr(tok, nullptr), op_(op), lhs_(lhs), rhs_(rhs) {
+    // Member access keeps both operands exactly as given;
+    // every other operator routes them through MayCast() first (lhs, then rhs).
+    if (op == '.')
+      return;
+    lhs_ = MayCast(lhs);
+    rhs_ = MayCast(rhs);
+  }
+
+  int op_;
+  Expr* lhs_;
+  Expr* rhs_;
+};
+
+
+/*
+ * Unary Operator:
+ * '++' (prefix/postfix)
+ * '--' (prefix/postfix)
+ * '&'  (ADDR)
+ * '*'  (DEREF)
+ * '+'  (PLUS)
+ * '-'  (MINUS)
+ * '~'
+ * '!'
+ * CAST // like (int)3
+ */
+class UnaryOp : public Expr {  // Unary expression node: operator code, an extra info word and one operand.
+  template<typename T> friend class Evaluator;
+  friend class AddrEvaluator;
+  friend class Generator;
+  friend class LValAssigner;
+
+public:
+  static UnaryOp* New(int op, Expr* operand, QualType type=nullptr, int info=0);
+  virtual ~UnaryOp() {}
+  virtual void Accept(Visitor* v);
+  virtual bool IsLVal();
+  ::Type *Convert();
+  static int encodeRed(int ax, int tag);  // encodeRed/decodeRed pack and unpack (ax, tag) into a single int
+  static void decodeRed(int info, int& ax, int& tag);
+  void TypeChecking();
+  void IncDecOpTypeChecking();
+  void AddrOpTypeChecking();
+  void DerefOpTypeChecking();
+  void ReduceOpTypeChecking();
+  void UnaryArithmOpTypeChecking();
+  void BitcastOpTypeChecking();
+  void CastOpTypeChecking();
+  void IntrinsicOpTypeChecking();
+
+protected:
+  UnaryOp(int op, Expr* operand, QualType type=nullptr, int info=0)
+    : Expr(operand->Tok(), type), op_(op), info_(info), operand_(operand) {
+    // CAST and ADDR keep the operand exactly as given;
+    // every other operator routes it through MayCast() first.
+    const bool keepAsIs = (op == Token::CAST || op == Token::ADDR);
+    if (!keepAsIs)
+      operand_ = MayCast(operand);
+  }
+
+  int op_;
+  int info_;
+  Expr* operand_;
+};
+
+class TransOp: public Expr {  // Expression node pairing one operand with an integer permutation vector.
+  friend class Generator;
+
+public:
+  using PermInt = std::vector<int>;
+
+public:
+  static TransOp* New(const PermInt& perm, Expr* operand);  // Public factory; the constructor itself is protected.
+  const PermInt& getPerm() const { return perm_; }
+  void Accept(Visitor* v);
+  bool IsLVal() { return false; }  // Never an lvalue.
+  void TypeChecking();
+
+protected:
+  TransOp(const PermInt& perm, Expr* operand)  // Reuses the operand's token; the type is left null here.
+    : Expr(operand->Tok(), nullptr), operand_(operand), perm_(perm) {}
+
+private:
+  Expr* operand_;
+  PermInt perm_;  // copy of the permutation passed to the constructor
+};
+
+
+// cond ? true : false
+class ConditionalOp : public Expr {  // Ternary conditional expression node.
+  template<typename T> friend class Evaluator;
+  friend class AddrEvaluator;
+  friend class Generator;
+
+public:
+  static ConditionalOp* New(const Token* tok,  // Takes a token, whereas the constructor below does not.
+      Expr* cond, Expr* exprTrue, Expr* exprFalse);
+  virtual ~ConditionalOp() {}
+  virtual void Accept(Visitor* v);
+  virtual bool IsLVal() { return false; }  // Never an lvalue.
+  ArithmType* Convert();
+  virtual void TypeChecking();
+
+protected:
+  ConditionalOp(Expr* cond, Expr* exprTrue, Expr* exprFalse)  // Token comes from cond; all three operands go through MayCast().
+      : Expr(cond->Tok(), nullptr), cond_(MayCast(cond)),
+        exprTrue_(MayCast(exprTrue)), exprFalse_(MayCast(exprFalse)) {}
+
+private:
+  Expr* cond_;
+  Expr* exprTrue_;
+  Expr* exprFalse_;
+};
+
+
+class FuncCall : public Expr {  // Call expression: a designator expression plus its argument list.
+  template<typename T> friend class Evaluator;
+  friend class AddrEvaluator;
+  friend class Generator;
+
+public:
+  using ArgList = std::vector<Expr*>;
+
+public:
+  static FuncCall* New(Expr* designator, const ArgList& args);  // Public factory; the constructor itself is protected.
+  ~FuncCall() {}
+  virtual void Accept(Visitor* v);
+
+  // A function call is of course not an lvalue
+  virtual bool IsLVal() { return false; }
+  ArgList* Args() { return &args_; }  // Pointer to the stored list, so callers can modify it in place.
+  Expr* Designator() { return designator_; }
+  const std::string& Name() const { return tok_->str_; }  // tok_ is the designator's token (see constructor).
+  ::FuncType* FuncType() { return designator_->Type()->ToFunc(); }
+  virtual void TypeChecking();
+
+protected:
+  FuncCall(Expr* designator, const ArgList& args)  // Copies args; the type is left null here.
+    : Expr(designator->Tok(), nullptr),
+      designator_(designator), args_(args) {}
+
+  Expr* designator_;
+  ArgList args_;
+};
+
+
+class Constant: public Expr {  // Literal node holding an integer, a floating-point value or a string pointer.
+  template<typename T> friend class Evaluator;
+  friend class AddrEvaluator;
+  friend class Generator;
+
+public:
+  static Constant* New(const Token* tok, int tag, long val);  // The factories take an int tag; the constructors take a QualType.
+  static Constant* New(const Token* tok, int tag, double val);
+  static Constant* New(const Token* tok, int tag, const std::string* val);
+  ~Constant() {}
+  virtual void Accept(Visitor* v);
+  virtual bool IsLVal() { return false; }  // Never an lvalue.
+  virtual void TypeChecking() {}  // Nothing to check here.
+
+  long IVal() const { return ival_; }  // The accessors read the union unchecked; the caller must know which member is active.
+  double FVal() const { return fval_; }
+  const std::string* SVal() const { return sval_; }
+  std::string SValRepr() const;
+  std::string Repr() const { return std::string(".LC") + std::to_string(id_); }  // NOTE(review): no constructor here sets id_ — presumably assigned by a friend before use; confirm
+
+protected:
+  Constant(const Token* tok, QualType type, long val)  // Each constructor initializes exactly one union member.
+      : Expr(tok, type), ival_(val) {}
+  Constant(const Token* tok, QualType type, double val)
+      : Expr(tok, type), fval_(val) {}
+  Constant(const Token* tok, QualType type, const std::string* val)
+      : Expr(tok, type), sval_(val) {}
+
+  union {  // ival_, fval_ and the {id_, sval_} pair share storage
+    long ival_;
+    double fval_;
+    struct {  // anonymous struct inside a union: a compiler extension, not standard C++
+      long id_;
+      const std::string* sval_;
+    };
+  };
+};
+
+
+class TempVar : public Expr {  // Temporary variable expression, identified by an integer tag.
+  template<typename T> friend class Evaluator;
+  friend class AddrEvaluator;
+  friend class Generator;
+
+public:
+  static TempVar* New(QualType type);  // Public factory; the constructor itself is protected.
+  virtual ~TempVar() {}
+  virtual void Accept(Visitor* v);
+  virtual bool IsLVal() { return true; }  // Always an lvalue.
+  virtual void TypeChecking() {}  // Nothing to check here.
+
+protected:
+  TempVar(QualType type): Expr(nullptr, type), tag_(GenTag()) {}  // No source token: tok_ is null for temporaries.
+
+private:
+  static int GenTag() {  // Own function-local counter (separate from LabelStmt's); the first tag handed out is 1.
+    static int tag = 0;
+    return ++tag;
+  }
+
+  int tag_;
+};
+
+
+enum Linkage {  // Linkage property stored on every Identifier (see Identifier::linkage_).
+  L_NONE,  // no linkage; default for Object::New and the value Identifier::ToTypeName() requires
+  L_EXTERNAL,
+  L_INTERNAL,
+};
+
+
+class Identifier: public Expr {  // Named entity: token, type, linkage and an attribute list.
+  template<typename T> friend class Evaluator;
+  friend class AddrEvaluator;
+  friend class Generator;
+  friend class LValAssigner;
+
+public:
+  static Identifier* New(const Token* tok, QualType type, Linkage linkage, const AttrList& attrList={});
+  virtual ~Identifier() {}
+  virtual void Accept(Visitor* v);
+  virtual bool IsLVal() { return false; }
+  virtual Object* ToObject() { return nullptr; }
+  virtual Enumerator* ToEnumerator() { return nullptr; }
+
+  // An identifier can name an object, a struct/union/enum tag, a typedef
+  // name, a function or a label.
+  Identifier* ToTypeName() {
+    // Objects and enumerators are never type names, and a type name has
+    // no linkage (functions carry external or internal linkage).
+    const bool isTypeName =
+        !ToObject() && !ToEnumerator() && linkage_ == L_NONE;
+    return isTypeName ? this : nullptr;
+  }
+  virtual const std::string Name() const { return tok_->str_; }
+  enum Linkage Linkage() const { return linkage_; }
+  void SetLinkage(enum Linkage linkage) { linkage_ = linkage; }
+  virtual void TypeChecking() {}
+
+protected:
+  Identifier(const Token* tok, QualType type, enum Linkage linkage, const AttrList& attrList={})
+      : Expr(tok, type), linkage_(linkage), attrList_(attrList) {}
+
+  // Linkage is a property of every identifier
+  enum Linkage linkage_;
+  AttrList attrList_;
+};
+
+
+class Enumerator: public Identifier {  // Enumeration constant: an Identifier backed by an integer Constant.
+  template<typename T> friend class Evaluator;
+  friend class AddrEvaluator;
+  friend class Generator;
+
+public:
+  static Enumerator* New(const Token* tok, int val);  // Public factory; the constructor itself is protected.
+  virtual ~Enumerator() {}
+  virtual void Accept(Visitor* v);
+  virtual Enumerator* ToEnumerator() { return this; }  // Overrides Identifier's nullptr default.
+  int Val() const { return cons_->IVal(); }  // IVal() returns long; narrowed to int here.
+
+protected:
+  Enumerator(const Token* tok, int val)  // Typed as T_INT with no linkage; the value lives in a freshly made Constant.
+      : Identifier(tok, ArithmType::New(T_INT), L_NONE),
+        cons_(Constant::New(tok, T_INT, (long)val)) {}
+
+  Constant* cons_;
+};
+
+
+class Object : public Identifier {  // Identifier that denotes an object: storage, layout, bit-field range, declaration.
+  template<typename T> friend class Evaluator;
+  friend class AddrEvaluator;
+  friend class Generator;
+  friend class LValAssigner;
+
+public:
+  static Object* New(const Token* tok,
+                     QualType type,
+                     int storage=0,
+                     enum Linkage linkage=L_NONE,
+                     unsigned char bitFieldBegin=0,
+                     unsigned char bitFieldWidth=0,
+                     const AttrList& attrList={});
+  static Object* NewAnony(const Token* tok,
+                          QualType type,
+                          int storage=0,
+                          enum Linkage linkage=L_NONE,
+                          unsigned char bitFieldBegin=0,
+                          unsigned char bitFieldWidth=0,
+                          const AttrList& attrList={});
+  ~Object() {}
+  virtual void Accept(Visitor* v);
+  virtual Object* ToObject() { return this; }  // Overrides Identifier's nullptr default.
+  virtual bool IsLVal() {
+    // TODO(wgtdkp): not all object is lval?
+    return true;
+  }
+  bool IsStatic() const {  // Static storage flag set, or any linkage at all.
+    return (Storage() & S_STATIC) || (Linkage() != L_NONE);
+  }
+  int Storage() const { return storage_; }
+  void SetStorage(int storage) { storage_ = storage; }
+  int Align() const { return align_; }
+  void SetAlign(int align) {
+    assert(align > 0);
+    // Allowing reduce alignment to implement __attribute__((packed))
+    //if (align < align_)
+    //  Error(this, "alignment specifier cannot reduce alignment");
+    align_ = align;
+  }
+  int Offset() const { return offset_; }
+  void SetOffset(int offset) { offset_ = offset; }
+  Declaration* Decl() { return decl_; }
+  void SetDecl(Declaration* decl) { decl_ = decl; }
+  const AttrList& GetAttrList() const { return attrList_; }  // Returns Object's own attrList_ (see member note below).
+  unsigned char BitFieldBegin() const { return bitFieldBegin_; }
+  unsigned char BitFieldEnd() const { return bitFieldBegin_ + bitFieldWidth_; }
+  unsigned char BitFieldWidth() const { return bitFieldWidth_; }
+  static unsigned long BitFieldMask(Object* bitField) {  // Convenience overload: mask for bitField's own range.
+    return BitFieldMask(bitField->bitFieldBegin_, bitField->bitFieldWidth_);
+  }
+  static unsigned long BitFieldMask(unsigned char begin, unsigned char width) {  // Mask with bits [begin, begin + width) set.
+    if (width == 0) return 0;  // width 0 means "not a bit-field"; also avoids an undefined shift by 64
+    return ((0xFFFFFFFFFFFFFFFFUL << (64 - begin - width)) >> (64 - width)) << begin;
+  }
+
+  bool HasInit() const { return decl_ && decl_->Inits().size(); }  // True only with a declaration holding at least one initializer.
+  bool Anonymous() const { return anonymous_; }
+  virtual const std::string Name() const { return Identifier::Name(); }
+  std::string Repr() const {  // "anonymous.<id>", "<name>.<id>" when there is no linkage, else "<name>".
+    assert(IsStatic() || anonymous_);
+    if (anonymous_)
+      return "anonymous." + std::to_string(id_);
+    if (linkage_ == L_NONE)
+      return Name() + "." + std::to_string(id_);
+    return Name();
+  }
+
+protected:
+  Object(const Token* tok,
+         QualType type,
+         int storage=0,
+         enum Linkage linkage=L_NONE,
+         unsigned char bitFieldBegin=0,
+         unsigned char bitFieldWidth=0,
+         const AttrList& attrList={})
+      : Identifier(tok, type, linkage),
+        storage_(storage),
+        offset_(0),
+        align_(type->Align()),
+        decl_(nullptr),
+        bitFieldBegin_(bitFieldBegin),
+        bitFieldWidth_(bitFieldWidth),
+        anonymous_(false),
+        attrList_(attrList){}
+
+private:
+  int storage_;
+  int offset_;
+  int align_;  // starts as the type's alignment; SetAlign() may raise or lower it
+
+  Declaration* decl_;
+
+  unsigned char bitFieldBegin_;
+  // 0 means it's not a bitfield
+  unsigned char bitFieldWidth_;
+
+  bool anonymous_;
+  long id_ {0};
+
+  AttrList attrList_;  // NOTE(review): hides Identifier::attrList_, which the constructor leaves empty — confirm that is intended
+};
+
+
+/*
+ * Declaration
+ */
+
+class FuncDef : public ExtDecl {  // Function definition: identifier, return label and (once set) the body.
+  template<typename T> friend class Evaluator;
+  friend class AddrEvaluator;
+  friend class Generator;
+
+public:
+  using ParamList = std::vector<Object*>;
+
+public:
+  static FuncDef* New(Identifier* ident, LabelStmt* retLabel);  // Public factory; the constructor itself is protected.
+  virtual ~FuncDef() {}
+  virtual void Accept(Visitor* v);
+  ::FuncType* FuncType() { return ident_->Type()->ToFunc(); }
+  CompoundStmt* Body() { return body_; }  // nullptr until SetBody() has been called.
+  void SetBody(CompoundStmt* body) { body_ = body; }
+  std::string Name() const { return ident_->Name(); }
+  enum Linkage Linkage() { return ident_->Linkage(); }
+
+protected:
+  FuncDef(Identifier* ident, LabelStmt* retLabel)  // body_ starts null so Body() never yields an indeterminate pointer.
+      : ident_(ident), retLabel_(retLabel), body_(nullptr) {}
+
+private:
+  Identifier* ident_;
+  LabelStmt* retLabel_;
+  CompoundStmt* body_;
+};
+
+
+using ExtDeclList = std::list<ExtDecl*>;
+
+class TranslationUnit : public ASTNode {  // Root node: an ordered list of external declarations.
+  template<typename T> friend class Evaluator;
+  friend class AddrEvaluator;
+  friend class Generator;
+
+public:
+  static TranslationUnit* New() { return new TranslationUnit();}  // Plain heap allocation; nothing in this header frees it.
+  virtual ~TranslationUnit() {}
+  virtual void Accept(Visitor* v);
+  void Add(ExtDecl* extDecl) { extDecls_.push_back(extDecl); }  // Appends, so insertion order is preserved.
+  ExtDeclList& ExtDecls() { return extDecls_; }
+  const ExtDeclList& ExtDecls() const { return extDecls_; }
+
+private:
+  TranslationUnit() {}  // Private: New() is the only way to create one from outside.
+
+  ExtDeclList extDecls_;
+};
+
+#endif
--- a/include/triton/lang/code_gen.h
+++ b/include/triton/lang/code_gen.h
@@ -0,0 +1,167 @@
+#pragma once
+
+#ifndef _WGTCC_CODE_GEN_H_
+#define _WGTCC_CODE_GEN_H_
+
+#include "ast.h"
+#include "visitor.h"
+#include <stack>
+
+namespace triton{
+namespace ir{
+
+class value;
+class module;
+class type;
+class context;
+class builder;
+class attribute;
+
+}
+}
+
+using namespace triton;
+
+class Parser;
+struct Addr;
+template<> class Evaluator<Addr>;
+struct StaticInitializer;
+class LValAssigner;
+
+using TypeList = std::vector<Type*>;
+using LocationList = std::vector<std::string>;
+using StaticInitList = std::vector<StaticInitializer>;
+
+// Error helpers: both always throw std::runtime_error (NOTE(review): <stdexcept> is not included directly by this header — confirm it arrives transitively)
+inline void should_not_happen(const std::string& suffix) { throw std::runtime_error("internal compiler error: " + suffix); }
+inline void error_not_implemented(const std::string& msg) { throw std::runtime_error(msg); }
+
+class Generator: public Visitor {  // AST visitor whose entry point Gen() takes the target ir::module.
+  friend class Evaluator<Addr>;
+  friend class LValAssigner;
+
+protected:
+  struct scope {  // Name-to-type and name-to-value tables (only referenced by the commented-out scopes_ below).
+    std::map<std::string, ir::type*> types;
+    std::map<std::string, ir::value*> values;
+  };
+
+  void set_ret(ir::value* value);
+  ir::value *GenUnaryMinus(ir::value* arg);
+  ir::value *GenUnaryInc(UnaryOp* arg, bool is_postfix, bool is_inc);
+
+public:
+  Generator(Parser* parser) : parser_(parser) {}  // All other pointer members start out null (see their initializers).
+
+  void Visit(ASTNode* node) { node->Accept(this); }  // The three helpers below simply dispatch through Accept().
+  void VisitExpr(Expr* expr) { expr->Accept(this); }
+  void VisitStmt(Stmt* stmt) { stmt->Accept(this); }
+
+  // Expression
+  void VisitBinaryOp(BinaryOp* binaryOp);
+  void VisitUnaryOp(UnaryOp* unaryOp);
+  void VisitTransOp(TransOp* transOp);
+  void VisitConditionalOp(ConditionalOp* condOp);
+  void VisitFuncCall(FuncCall* funcCall);
+  void VisitObject(Object* obj);
+  void VisitEnumerator(Enumerator* enumer);
+  void VisitIdentifier(Identifier* ident);
+  void VisitConstant(Constant* cons);
+  void VisitTempVar(TempVar* tempVar);
+
+  // Statement
+  void VisitDeclaration(Declaration* init);
+  void VisitEmptyStmt(EmptyStmt* emptyStmt);
+  void VisitIfStmt(IfStmt* ifStmt);
+  void VisitForStmt(ForStmt* forStmt);
+  void VisitJumpStmt(JumpStmt* jumpStmt);
+  void VisitReturnStmt(ReturnStmt* returnStmt);
+  void VisitLabelStmt(LabelStmt* labelStmt);
+  void VisitCompoundStmt(CompoundStmt* compoundStmt);
+
+  void VisitFuncDef(FuncDef* funcDef);
+  void VisitTranslationUnit(TranslationUnit* unit);
+
+  void Gen(ir::module *mod);  // Entry point; body not in this header.
+
+protected:
+  // Triton-IR attributes
+  ir::attribute GenIRAttr(ASTNode::Attr attr);
+
+  // Triton-IR metadata
+  void SetIRMetadata(ASTNode::Attr attr, ir::value *rhs);
+
+  // Triton-IR values
+  ir::value* GenAssignOp(Expr* lvalue, ir::value* rhs);
+  ir::value* GenBroadcastOp(ir::value* src, ir::type* dst_ty);
+  ir::value* GenNumcastOp(ir::value*src, ir::type* dst_ty);
+  ir::value* GenSemCastOp(ir::value* op, ir::type* type);
+  ir::value* GenBitCastOp(ir::value* src, ir::type* dst_ty);
+
+  // Triton-IR types
+  static ir::type* GenIRType(::Type* type, ir::context &ctx);
+  static ir::type* GenIRArithmType(ArithmType* type, ir::context& ctx);
+  static ir::type* GenIRArrayType(ArrayType* type,  ir::context& ctx);
+  static ir::type* GenIRTileType(TileType* type,  ir::context& ctx);
+  static ir::type* GenIRFuncType(FuncType* type,  ir::context& ctx);
+  static ir::type* GenIRPointerType(PointerType* type,  ir::context& ctx);
+  static ir::type* GenIRStructType(StructType* type,  ir::context& ctx);
+  void AllocObjects(Scope* scope, const FuncDef::ParamList& params=FuncDef::ParamList());
+
+  // SSA
+  void pushScope();
+  void popScope();
+
+private:
+  Parser* parser_;
+  ir::value* ret_ {nullptr};  // null-initialized: the constructor only sets parser_
+  ir::builder* bld_ {nullptr};
+  ir::context* ctx_ {nullptr};
+  ir::module* mod_ {nullptr};
+
+private:
+//  std::stack<scope> scopes_;
+  LValAssigner* assign_ {nullptr};
+};
+
+
+class LValAssigner: public Visitor {  // Visitor used through GenExpr(expr, rhs); node kinds that cannot be lvalues throw.
+public:
+  LValAssigner(Generator* gen): gen_(gen) {}  // ret_ and rhs_ start out null (see their initializers).
+
+  // Expression
+  void VisitBinaryOp(BinaryOp* binaryOp);
+  void VisitUnaryOp(UnaryOp* unaryOp);
+  void VisitObject(Object* obj);
+  void VisitIdentifier(Identifier* ident);
+
+  void VisitConditionalOp(ConditionalOp*)      { should_not_happen("conditional cannot be lvalue"); }
+  void VisitFuncCall(FuncCall*)                { should_not_happen("funccall cannot be lvalue"); }
+  void VisitTransOp(TransOp*)                  { should_not_happen("transop cannot be lvalue"); }
+  void VisitEnumerator(Enumerator*)            { should_not_happen("enumerator cannot be lvalue"); }
+  void VisitConstant(Constant*)                { should_not_happen("constant cannot be lvalue"); }
+  void VisitTempVar(TempVar*)                  { should_not_happen("tempvar cannot be lvalue"); }
+  void VisitDeclaration(Declaration*)          { should_not_happen("declaration cannot be lvalue"); }
+  void VisitEmptyStmt(EmptyStmt*)              { should_not_happen("empty statement cannot be lvalue"); }
+  void VisitIfStmt(IfStmt*)                    { should_not_happen("if statement cannot be lvalue"); }
+  void VisitForStmt(ForStmt*)                  { should_not_happen("for statement cannot be lvalue"); }
+  void VisitJumpStmt(JumpStmt*)                { should_not_happen("jump statement cannot be lvalue"); }
+  void VisitReturnStmt(ReturnStmt*)            { should_not_happen("return statement cannot be lvalue"); }
+  void VisitLabelStmt(LabelStmt*)              { should_not_happen("label statement cannot be lvalue"); }
+  void VisitCompoundStmt(CompoundStmt*)        { should_not_happen("compound statement cannot be lvalue"); }
+  void VisitFuncDef(FuncDef*)                  { should_not_happen("function definition cannot be lvalue"); }
+  void VisitTranslationUnit(TranslationUnit*)  { should_not_happen("translation unit cannot be lvalue"); }
+
+  ir::value* GenExpr(Expr* expr, ir::value* rhs) {  // Stores rhs, dispatches on expr, then hands back ret_.
+    rhs_ = rhs;
+    expr->Accept(this);
+    return ret_;
+  }
+
+private:
+  ir::value* ret_ {nullptr};  // null-initialized so GenExpr never returns an indeterminate pointer
+  ir::value* rhs_ {nullptr};
+  Generator* gen_;
+};
+
+#endif
--- a/include/triton/lang/cpp.h
+++ b/include/triton/lang/cpp.h
@@ -0,0 +1,164 @@
+#pragma once
+
+#ifndef _WGTCC_CPP_H_
+#define _WGTCC_CPP_H_
+
+#include "scanner.h"
+
+#include <cstdio>
+#include <list>
+#include <map>
+#include <set>
+#include <stack>
+#include <string>
+
+class Macro;
+struct CondDirective;
+
+using MacroMap = std::map<std::string, Macro>;
+using ParamList = std::list<std::string>;
+using ParamMap = std::map<std::string, TokenSequence>;
+using PPCondStack = std::stack<CondDirective>;
+using PathList = std::list<std::string>;
+
+
+// A single preprocessor macro definition: its replacement token sequence
+// plus, for function-like macros, the parameter names and a variadic flag.
+class Macro {
+public:
+  // Object-like macro: no parameter list, never variadic.
+  // `preDef` marks the macro as predefined (Preprocessor::RemoveMacro
+  // refuses to erase such entries).
+  Macro(const TokenSequence& repSeq, bool preDef=false)
+      : funcLike_(false), variadic_(false),
+        preDef_(preDef), repSeq_(repSeq) {}
+
+  // Function-like macro. `params` and `repSeq` are only copied, so they are
+  // accepted by const reference; existing callers passing non-const lvalues
+  // keep compiling unchanged.
+  Macro(bool variadic, const ParamList& params,
+        const TokenSequence& repSeq, bool preDef=false)
+      : funcLike_(true), variadic_(variadic), preDef_(preDef),
+        params_(params), repSeq_(repSeq) {}
+
+  ~Macro() {}
+
+  // Read-only queries; const-qualified so they can be used through a
+  // `const Macro&` (e.g. the argument of Preprocessor::AddMacro).
+  bool FuncLike() const { return funcLike_; }
+  bool ObjLike() const { return !FuncLike(); }
+  bool Variadic() const { return variadic_; }
+  bool PreDef() const { return preDef_; }
+
+  // Parameter names of a function-like macro (empty for object-like ones).
+  ParamList& Params() { return params_; }
+  const ParamList& Params() const { return params_; }
+
+  // Returns the replacement sequence for an expansion at `filename`:`line`.
+  // Defined out of line; NOTE(review): presumably it re-stamps the token
+  // locations -- confirm in cpp.cc.
+  TokenSequence RepSeq(const std::string* filename, unsigned line);
+
+private:
+  bool funcLike_;
+  bool variadic_;
+  bool preDef_;
+  ParamList params_;
+  TokenSequence repSeq_;
+};
+
+
+// One entry of the conditional-directive stack (PPCondStack).
+// Preprocessor::NeedExpand() treats the innermost entry as active only when
+// both enabled_ and cond_ are true.
+struct CondDirective {
+  // NOTE(review): presumably the directive kind (#if / #ifdef / #elif ...)
+  // that opened this entry -- confirm in cpp.cc.
+  int tag_;
+  // NOTE(review): presumably whether the enclosing region was itself being
+  // expanded when this directive was seen -- confirm in cpp.cc.
+  bool enabled_;
+  // NOTE(review): presumably the evaluated condition of this directive --
+  // confirm in cpp.cc.
+  bool cond_;
+};
+
+
+// Preprocessor front end: owns the macro table, the include search paths and
+// the stack of open conditional directives. Most of the work is done by
+// member functions defined out of line; only the small table helpers are
+// inline here.
+class Preprocessor {
+public:
+  // `str` is the source text when `isSrc` is true (kept in fSrc_), otherwise
+  // it is a file name (kept in fName_); the unused pointer stays null.
+  Preprocessor(const std::string* str, bool isSrc = true)
+      : curLine_(1), lineLine_(0), curCond_(true),
+        fName_(isSrc ? nullptr : str),
+        fSrc_(isSrc ? str : nullptr) {
+    // Add predefined
+    Init();
+  }
+
+  ~Preprocessor() {}
+
+  // Token-stream processing (all defined out of line).
+  void Finalize(TokenSequence os);
+  void Process(TokenSequence& os);
+  void Expand(TokenSequence& os, TokenSequence is, bool inCond=false);
+  void Subst(TokenSequence& os, TokenSequence is,
+             bool leadingWS, const HideSet& hs, ParamMap& params);
+  void Glue(TokenSequence& os, TokenSequence is);
+  void Glue(TokenSequence& os, const Token* tok);
+  const Token* Stringize(TokenSequence is);
+  void Stringize(std::string& str, TokenSequence is);
+  const Token* ParseActualParam(TokenSequence& is, Macro* macro, ParamMap& paramMap);
+  int GetDirective(TokenSequence& is);
+  const Token* EvalDefOp(TokenSequence& is);
+  void ReplaceIdent(TokenSequence& is);
+
+  // Directive handlers (all defined out of line).
+  void ParseDirective(TokenSequence& os, TokenSequence& is, int directive);
+  void ParseIf(TokenSequence ls);
+  void ParseIfdef(TokenSequence ls);
+  void ParseIfndef(TokenSequence ls);
+  void ParseElif(TokenSequence ls);
+  void ParseElse(TokenSequence ls);
+  void ParseEndif(TokenSequence ls);
+  void ParseInclude(TokenSequence& is, TokenSequence ls);
+  void ParseDef(TokenSequence ls);
+  void ParseUndef(TokenSequence ls);
+  void ParseLine(TokenSequence ls);
+  void ParseError(TokenSequence ls);
+  void ParsePragma(TokenSequence ls);
+  void IncludeSrc(TokenSequence& is, const std::string* text, const std::string* filename);
+  void IncludeFile(TokenSequence& is, const std::string* filename);
+  bool ParseIdentList(ParamList& params, TokenSequence& is);
+
+  // Looks `name` up in the macro table; yields a pointer into the table, or
+  // nullptr when no such macro is defined.
+  Macro* FindMacro(const std::string& name) {
+    auto entry = macroMap_.find(name);
+    return entry != macroMap_.end() ? &entry->second : nullptr;
+  }
+
+  void AddMacro(const std::string& name,
+                std::string* text, bool preDef=false);
+
+  // Registers `macro` under `name`, silently replacing any previous
+  // definition with the same name.
+  void AddMacro(const std::string& name, const Macro& macro) {
+    auto previous = macroMap_.find(name);
+    // TODO(wgtdkp): give warning
+    if (previous != macroMap_.end())
+      macroMap_.erase(previous);
+    macroMap_.insert(std::make_pair(name, macro));
+  }
+
+  // Erases the macro called `name`. Unknown names and predefined macros
+  // (which cannot be undefined) are left untouched.
+  void RemoveMacro(const std::string& name) {
+    auto entry = macroMap_.find(name);
+    if (entry != macroMap_.end() && !entry->second.PreDef())
+      macroMap_.erase(entry);
+  }
+
+  std::string* SearchFile(const std::string& name,
+                          const bool libHeader,
+                          bool next,
+                          const std::string& curPath);
+
+  void AddSearchPath(std::string path);
+  void HandleTheFileMacro(TokenSequence& os, const Token* macro);
+  void HandleTheLineMacro(TokenSequence& os, const Token* macro);
+  void UpdateFirstTokenLine(TokenSequence ts);
+
+  // True when no conditional directive is open, or when the innermost open
+  // one is both enabled and has a true condition.
+  bool NeedExpand() const {
+    if (ppCondStack_.empty())
+      return true;
+    const CondDirective innermost = ppCondStack_.top();
+    return innermost.enabled_ && innermost.cond_;
+  }
+
+private:
+  void Init();
+
+  PPCondStack ppCondStack_;
+  unsigned curLine_;
+  unsigned lineLine_;
+  bool curCond_;
+
+  MacroMap macroMap_;
+  PathList searchPaths_;
+  const std::string* fName_;  // set when constructed with isSrc == false
+  const std::string* fSrc_;   // set when constructed with isSrc == true
+};
+
+#endif
--- a/include/triton/lang/encoding.h
+++ b/include/triton/lang/encoding.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#ifndef _WGTCC_ENCODING_H_
+#define _WGTCC_ENCODING_H_
+
+#include <string>
+
+
+// Encoding tag produced by the literal/character scanning routines
+// (Scanner::ScanCharacter, Scanner::ScanLiteral, Parser::ParseLiteral).
+// NOTE(review): the enumerators presumably correspond to the C11 literal
+// prefixes (none, u, U, u8, L) -- confirm against Scanner::ScanEncoding.
+enum class Encoding {
+  NONE,
+  CHAR16,
+  CHAR32,
+  UTF8,
+  WCHAR
+};
+
+
+// String re-encoding helpers, defined out of line. Each takes `str` by
+// non-const reference, so the result is written back into the argument.
+// NOTE(review): the input is presumably UTF-8 and `c` a Unicode code point
+// -- confirm in encoding.cc.
+void ConvertToUTF16(std::string& str);
+void ConvertToUTF32(std::string& str);
+void AppendUCN(std::string& str, int c);
+
+#endif
--- a/include/triton/lang/error.h
+++ b/include/triton/lang/error.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#ifndef _WGTCC_ERROR_H_
+#define _WGTCC_ERROR_H_
+
+
+struct SourceLocation;
+class Token;
+class Expr;
+
+
+// Diagnostic entry points, defined out of line. Every overload is declared
+// [[noreturn]], so control never comes back to the caller. The overloads
+// differ only in the position information they receive: none, a
+// SourceLocation, a Token, or an Expr.
+// NOTE(review): `format` plus the variadic tail is presumably printf-style
+// -- confirm in error.cc.
+[[noreturn]] void Error(const char* format, ...);
+[[noreturn]] void Error(const SourceLocation& loc, const char* format, ...);
+[[noreturn]] void Error(const Token* tok, const char* format, ...);
+[[noreturn]] void Error(const Expr* expr, const char* format, ...);
+
+#endif
--- a/include/triton/lang/evaluator.h
+++ b/include/triton/lang/evaluator.h
@@ -0,0 +1,130 @@
+#pragma once
+
+#ifndef _WGTCC_EVALUATOR_H_
+#define _WGTCC_EVALUATOR_H_
+
+#include "ast.h"
+#include "error.h"
+#include "visitor.h"
+
+
+class Expr;
+
+// Constant-expression evaluator producing a value of arithmetic type T.
+// Eval() dispatches on the node through Accept() and returns the value the
+// matching Visit* handler stored in val_. Nodes that cannot appear in a
+// constant expression are reported through Error().
+template<typename T>
+class Evaluator: public Visitor {
+public:
+  Evaluator() {}
+
+  virtual ~Evaluator() {}
+
+  // Defined out of line.
+  virtual void VisitBinaryOp(BinaryOp* binary);
+  virtual void VisitUnaryOp(UnaryOp* unary);
+  virtual void VisitConditionalOp(ConditionalOp* cond);
+
+  virtual void VisitFuncCall(FuncCall* funcCall) {
+    Error(funcCall, "expect constant expression");
+  }
+  // An enumerator evaluates to its declared value, converted to T.
+  virtual void VisitEnumerator(Enumerator* enumer) {
+    val_ = static_cast<T>(enumer->Val());
+  }
+  virtual void VisitIdentifier(Identifier* ident) {
+    Error(ident, "expect constant expression");
+  }
+  virtual void VisitTransOp(TransOp* trans) {
+    Error(trans, "expect constant expression");
+  }
+  virtual void VisitObject(Object* obj) {
+    Error(obj, "expect constant expression");
+  }
+  // Literal constants: the floating or integer payload is converted to T.
+  // Any other constant type trips the assertion; when assertions are
+  // compiled out val_ keeps its previous (initially zero) value.
+  virtual void VisitConstant(Constant* cons) {
+    if (cons->Type()->IsFloat()) {
+      val_ = static_cast<T>(cons->FVal());
+    } else if (cons->Type()->IsInteger()) {
+      val_ = static_cast<T>(cons->IVal());
+    } else {
+      assert(false);
+    }
+  }
+  virtual void VisitTempVar(TempVar* tempVar) { assert(false); }
+
+  // We may should assert here
+  // Statement-level nodes are ignored: they leave val_ untouched.
+  virtual void VisitDeclaration(Declaration* init) {}
+  virtual void VisitIfStmt(IfStmt* ifStmt) {}
+  virtual void VisitForStmt(ForStmt* forStmt) {}
+  virtual void VisitJumpStmt(JumpStmt* jumpStmt) {}
+  virtual void VisitReturnStmt(ReturnStmt* returnStmt) {}
+  virtual void VisitLabelStmt(LabelStmt* labelStmt) {}
+  virtual void VisitEmptyStmt(EmptyStmt* emptyStmt) {}
+  virtual void VisitCompoundStmt(CompoundStmt* compStmt) {}
+  virtual void VisitFuncDef(FuncDef* funcDef) {}
+  virtual void VisitTranslationUnit(TranslationUnit* unit) {}
+
+  // Evaluates `expr` and returns the result.
+  T Eval(Expr* expr) {
+    expr->Accept(this);
+    return val_;
+  }
+
+private:
+  // Value-initialised so that Eval() never returns an indeterminate value
+  // when the visited handler does not assign a result (the no-op statement
+  // handlers above, or the assert(false) paths in release builds).
+  T val_{};
+};
+
+
+// Result type of Evaluator<Addr>: a label plus an integer offset.
+// Evaluator<Addr> fills label_ from an identifier name or an object's
+// Repr(), and offset_ from an enumerator value or 0.
+struct Addr {
+  std::string label_;
+  int offset_;
+};
+
+// Specialisation that evaluates an address constant expression into an
+// Addr (label + offset) instead of an arithmetic value.
+template<>
+class Evaluator<Addr>: public Visitor {
+public:
+  // The constructor and destructor use the injected class name. Spelling
+  // them `Evaluator<Addr>()` / `~Evaluator<Addr>()` is ill-formed since
+  // C++20 and diagnosed by recent compilers.
+  Evaluator() {}
+  virtual ~Evaluator() {}
+
+  // Defined out of line.
+  virtual void VisitBinaryOp(BinaryOp* binary);
+  virtual void VisitUnaryOp(UnaryOp* unary);
+  virtual void VisitConditionalOp(ConditionalOp* cond);
+
+  virtual void VisitFuncCall(FuncCall* funcCall) {
+    Error(funcCall, "expect constant expression");
+  }
+  virtual void VisitTransOp(TransOp* trans) {
+    Error(trans, "expect constant expression");
+  }
+  // An enumerator contributes only an offset; label_ is left as it was.
+  virtual void VisitEnumerator(Enumerator* enumer) {
+    addr_.offset_ = enumer->Val();
+  }
+  // An identifier evaluates to its own name with offset 0.
+  virtual void VisitIdentifier(Identifier* ident) {
+    addr_.label_ = ident->Name();
+    addr_.offset_ = 0;
+  }
+  // Only static objects are accepted; the label is the object's Repr().
+  virtual void VisitObject(Object* obj) {
+    if (!obj->IsStatic()) {
+      Error(obj, "expect static object");
+    }
+    addr_.label_ = obj->Repr();
+    addr_.offset_ = 0;
+  }
+  virtual void VisitConstant(Constant* cons);
+  virtual void VisitTempVar(TempVar* tempVar) { assert(false); }
+
+  // We may should assert here
+  // Statement-level nodes are ignored: they leave addr_ untouched.
+  virtual void VisitDeclaration(Declaration* init) {}
+  virtual void VisitIfStmt(IfStmt* ifStmt) {}
+  virtual void VisitForStmt(ForStmt* forStmt) {}
+  virtual void VisitJumpStmt(JumpStmt* jumpStmt) {}
+  virtual void VisitReturnStmt(ReturnStmt* returnStmt) {}
+  virtual void VisitLabelStmt(LabelStmt* labelStmt) {}
+  virtual void VisitEmptyStmt(EmptyStmt* emptyStmt) {}
+  virtual void VisitCompoundStmt(CompoundStmt* compStmt) {}
+  virtual void VisitFuncDef(FuncDef* funcDef) {}
+  virtual void VisitTranslationUnit(TranslationUnit* unit) {}
+
+  // Evaluates `expr` and returns the resulting address.
+  Addr Eval(Expr* expr) {
+    expr->Accept(this);
+    return addr_;
+  }
+
+private:
+  // Value-initialised so offset_ starts at 0 rather than indeterminate:
+  // several handlers above assign nothing, and Eval() copies the whole
+  // struct out.
+  Addr addr_{};
+};
+
+#endif
--- a/include/triton/lang/mem_pool.h
+++ b/include/triton/lang/mem_pool.h
@@ -0,0 +1,103 @@
+#pragma once
+
+#ifndef _WGTCC_MEM_POOL_H_
+#define _WGTCC_MEM_POOL_H_
+
+#include <cstddef>
+#include <vector>
+
+
+// Abstract, non-copyable interface of an object pool. Concrete pools
+// implement Alloc/Free/Clear; allocated_ is available to them as a counter
+// and starts at zero.
+class MemPool {
+public:
+  MemPool() = default;
+  virtual ~MemPool() = default;
+
+  // Pools are identity objects: copying is disabled.
+  MemPool(const MemPool&) = delete;
+  MemPool& operator=(const MemPool&) = delete;
+
+  virtual void* Alloc() = 0;
+  virtual void Free(void* addr) = 0;
+  virtual void Clear() = 0;
+
+protected:
+  size_t allocated_ = 0;
+};
+
+
+// Free-list pool handing out raw storage for objects of type T.
+// Storage is obtained in blocks of COUNT chunks; unused chunks are linked
+// through Chunk::next_ with root_ pointing at the first free one.
+// The destructor does not release the blocks: call Clear() to free them.
+template <class T>
+class MemPoolImp: public MemPool {
+public:
+  MemPoolImp() : root_(nullptr) {}
+  virtual ~MemPoolImp() {}
+  // Non-copyable; the deleted operations name this class so that the
+  // intent is explicit.
+  MemPoolImp(const MemPoolImp& other) = delete;
+  MemPoolImp& operator=(const MemPoolImp& other) = delete;
+  virtual void* Alloc();
+  virtual void Free(void* addr);
+  virtual void Clear();
+
+private:
+  enum {
+    // Target payload of one block, in bytes.
+    BLOCK_BYTES = 4 * 1024,
+    // Chunks per block. Clamped to at least one so that a T larger than
+    // BLOCK_BYTES neither yields a zero-length array nor makes COUNT - 1
+    // wrap around in the Block constructor.
+    COUNT = (BLOCK_BYTES / sizeof(T)) > 0 ? (BLOCK_BYTES / sizeof(T)) : 1
+  };
+
+  // A chunk is either a link of the free list or storage for one T.
+  // alignas(T) guarantees the storage is suitably aligned even when T is
+  // more strictly aligned than a pointer.
+  union Chunk {
+    Chunk* next_;
+    alignas(T) char mem_[sizeof(T)];
+  };
+
+  struct Block {
+    // Threads all chunks of the block into a singly linked free list whose
+    // last element terminates with nullptr.
+    Block() {
+      const size_t count = COUNT;
+      for (size_t i = 0; i + 1 < count; ++i)
+        chunks_[i].next_ = &chunks_[i+1];
+      chunks_[count-1].next_ = nullptr;
+    }
+    Chunk chunks_[COUNT];
+  };
+
+  std::vector<Block*> blocks_;  // every block ever allocated, for Clear()
+  Chunk* root_;                 // head of the free list, nullptr when empty
+};
+
+
+// Pops one chunk off the free list and returns it as raw storage. A new
+// block is allocated first when the free list is empty.
+template <class T>
+void* MemPoolImp<T>::Alloc() {
+  if (root_ == nullptr) {
+    // Out of free chunks: allocate another block.
+    Block* fresh = new Block();
+    root_ = fresh->chunks_;
+    // If blocks_ were a std::list, push_back would actually cost more.
+    // So even though random access is not needed here (which makes
+    // std::vector's copying pure overhead), std::vector is still the
+    // better choice -- at the price of some memory wasted by its
+    // exponential capacity growth.
+    blocks_.push_back(fresh);
+  }
+
+  Chunk* head = root_;
+  root_ = head->next_;
+  ++allocated_;
+  return head;
+}
+
+
+// Returns the chunk at `addr` to the pool by pushing it onto the front of
+// the free list. A null `addr` is ignored.
+template <class T>
+void MemPoolImp<T>::Free(void* addr) {
+  if (addr != nullptr) {
+    Chunk* released = static_cast<Chunk*>(addr);
+    released->next_ = root_;
+    root_ = released;
+    --allocated_;
+  }
+}
+
+
+// Deletes every block owned by the pool and resets it to the empty state.
+// Any storage previously handed out by Alloc() becomes invalid.
+template <class T>
+void MemPoolImp<T>::Clear() {
+  for (size_t i = 0; i < blocks_.size(); ++i)
+    delete blocks_[i];
+  blocks_.resize(0);
+
+  allocated_ = 0;
+  root_ = nullptr;
+}
+
+#endif
--- a/include/triton/lang/parser.h
+++ b/include/triton/lang/parser.h
@@ -0,0 +1,260 @@
+#pragma once
+
+#ifndef _PARSER_H_
+#define _PARSER_H_
+
+#include "ast.h"
+#include "encoding.h"
+#include "error.h"
+#include "mem_pool.h"
+#include "scope.h"
+#include "token.h"
+
+#include <cassert>
+#include <memory>
+#include <stack>
+
+
+class Preprocessor;
+
+// Bundle of a declarator's token, its qualified type and its attribute
+// list. It is the return type of Parser::ParseDeclarator.
+struct DeclInfo {
+  // `_attrs` defaults to an empty attribute list. All three arguments are
+  // copied into the members of the same name.
+  DeclInfo(const Token* _tok,
+           QualType _type,
+           ASTNode::AttrList _attrs = {})
+    : tok(_tok), type(_type), attrs(_attrs) {}
+
+  const Token* tok;          // NOTE(review): presumably the declared identifier's token -- confirm
+  QualType type;
+  ASTNode::AttrList attrs;
+};
+
+
+// Recursive-descent parser over a TokenSequence. It builds the AST rooted at
+// unit_ and tracks the current scope, the function being parsed and the
+// labels and jump targets of that function. Nearly all Parse* members are
+// defined out of line.
+class Parser {
+  using LiteralList = std::vector<Constant*>;
+  using StaticObjectList = std::vector<Object*>;
+  using CaseLabelList = std::vector<std::pair<Constant*, LabelStmt*>>;
+  using LabelJumpList = std::list<std::pair<const Token*, JumpStmt*>>;
+  using LabelMap = std::map<std::string, LabelStmt*>;
+  friend class Generator;
+
+public:
+  // Binds the parser to `ts` (kept by reference) and registers itself with
+  // the sequence through SetParser(). Starts with a fresh translation unit,
+  // a file-level current scope and a separate scope for external symbols.
+  explicit Parser(TokenSequence& ts)
+    : unit_(TranslationUnit::New()),
+      ts_(ts),
+      externalSymbols_(new Scope(nullptr, S_BLOCK)),
+      errTok_(nullptr),
+      curScope_(new Scope(nullptr, S_FILE)),
+      curFunc_(nullptr),
+      breakDest_(nullptr),
+      continueDest_(nullptr),
+      caseLabels_(nullptr),
+      defaultLabel_(nullptr) {
+    ts_.SetParser(this);
+  }
+
+  ~Parser() {}
+
+  // Constants and literals
+  Constant* ParseConstant(const Token* tok);
+  Constant* ParseFloat(const Token* tok);
+  Constant* ParseInteger(const Token* tok);
+  Constant* ParseCharacter(const Token* tok);
+  Encoding ParseLiteral(std::string& str, const Token* tok);
+  Constant* ConcatLiterals(const Token* tok);
+  Expr* ParseGeneric();
+
+  // Top level
+  void Parse();
+  void ParseTranslationUnit();
+  FuncDef* ParseFuncDef(Identifier* ident);
+
+
+  // Expressions
+  Expr* ParseExpr();
+  Expr* ParsePrimaryExpr();
+  QualType TryCompoundLiteral();
+  Object* ParseCompoundLiteral(QualType type);
+  Expr* ParsePostfixExpr();
+  Expr* ParsePostfixExprTail(Expr* primExpr);
+  Expr* ParseSubScripting(Expr* pointer);
+  BinaryOp* ParseMemberRef(const Token* tok, int op, Expr* lhs);
+  UnaryOp* ParsePostfixIncDec(const Token* tok, Expr* operand);
+  FuncCall* ParseFuncCall(Expr* caller);
+
+  Expr* ParseUnaryExpr();
+  Constant* ParseSizeof();
+  Constant* ParseAlignof();
+  UnaryOp* ParsePrefixIncDec(const Token* tok);
+  UnaryOp* ParseUnaryIntrinsicOp(int op);
+  UnaryOp* ParseUnaryOp(const Token* tok, int op);
+  Expr* ParseDerefOp(const Token* tok);
+
+  QualType ParseTypeName();
+  Expr* ParseCastExpr();
+  Expr* ParseRangeExpr();
+  Expr* ParseMatmulExpr();
+  Expr* ParseMultiplicativeExpr();
+  Expr* ParseAdditiveExpr();
+  Expr* ParseShiftExpr();
+  Expr* ParseRelationalExpr();
+  Expr* ParseEqualityExpr();
+  Expr* ParseBitiwiseAndExpr();
+  Expr* ParseBitwiseXorExpr();
+  Expr* ParseBitwiseOrExpr();
+  Expr* ParseLogicalAndExpr();
+  Expr* ParseLogicalOrExpr();
+  Expr* ParseConditionalExpr();
+  Expr* ParseCommaExpr();
+  Expr* ParseAssignExpr();
+
+  // Declarations
+  CompoundStmt* ParseDecl();
+  void ParseStaticAssert();
+  QualType ParseDeclSpec(int* storageSpec, int* funcSpec, int* alignSpec);
+  QualType ParseSpecQual();
+  int ParseAlignas();
+  Type* ParseStructUnionSpec(bool isStruct);
+  StructType* ParseStructUnionDecl(StructType* type);
+  void ParseBitField(StructType* structType, const Token* tok, QualType type);
+  Type* ParseEnumSpec();
+  Type* ParseEnumerator(ArithmType* type);
+  int ParseQual();
+  QualType ParsePointer(QualType typePointedTo);
+  DeclInfo ParseDeclarator(QualType type);
+  QualType ParseArrayFuncDeclarator(const Token* ident, QualType base);
+  int ParseArrayLength();
+  TileType::ShapeInt ParseTileShape();
+  bool ParseParamList(FuncType::ParamList& params);
+  Object* ParseParamDecl();
+
+  QualType ParseAbstractDeclarator(QualType type);
+  Identifier* ParseDirectDeclarator(QualType type,
+                                    int storageSpec,
+                                    int funcSpec,
+                                    int align);
+  // Initializer
+  void ParseInitializer(Declaration* decl,
+                        QualType type,
+                        int offset,
+                        bool designated=false,
+                        bool forceBrace=false,
+                        unsigned char bitFieldBegin=0,
+                        unsigned char bitFieldWidth=0);
+  void ParseArrayInitializer(Declaration* decl,
+                             ArrayType* type,
+                             int offset,
+                             bool designated);
+  StructType::Iterator ParseStructDesignator(StructType* type,
+                                             const std::string& name);
+  void ParseStructInitializer(Declaration* decl,
+                              StructType* type,
+                              int offset,
+                              bool designated);
+  bool ParseLiteralInitializer(Declaration* init,
+                               ArrayType* type,
+                               int offset);
+  Declaration* ParseInitDeclarator(Identifier* ident);
+  Declaration* ParseInitDeclaratorSub(Object* obj);
+
+  // Statements
+  Stmt* ParseStmt();
+  CompoundStmt* ParseCompoundStmt(FuncType* funcType=nullptr);
+  IfStmt* ParseIfStmt();
+  CompoundStmt* ParseSwitchStmt();
+  CompoundStmt* ParseWhileStmt();
+  CompoundStmt* ParseDoStmt();
+  ForStmt *ParseForStmt();
+  JumpStmt* ParseGotoStmt();
+  JumpStmt* ParseContinueStmt();
+  JumpStmt* ParseBreakStmt();
+  ReturnStmt* ParseReturnStmt();
+  CompoundStmt* ParseLabelStmt(const Token* label);
+  CompoundStmt* ParseCaseStmt();
+  CompoundStmt* ParseDefaultStmt();
+  Identifier* ProcessDeclarator(const Token* tok,
+                                QualType type, const ASTNode::AttrList &attrs,
+                                int storageSpec,
+                                int funcSpec,
+                                int align);
+  // GNU extensions
+  ASTNode::AttrList TryAttributeSpecList();
+  void ParseAttributeSpec(ASTNode::AttrList &attrList);
+  ASTNode::Attr ParseAttribute();
+
+  // True when `tok` is a type specifier/qualifier keyword, or an identifier
+  // that the current scope chain resolves to a type name.
+  bool IsTypeName(const Token* tok) const{
+    if (tok->IsTypeSpecQual())
+      return true;
+    if (!tok->IsIdentifier())
+      return false;
+
+    auto found = curScope_->Find(tok);
+    return found && found->ToTypeName();
+  }
+
+  // Same as IsTypeName(), except that the keyword test is Token::IsDecl()
+  // instead of Token::IsTypeSpecQual().
+  bool IsType(const Token* tok) const{
+    if (tok->IsDecl())
+      return true;
+    if (!tok->IsIdentifier())
+      return false;
+
+    auto found = curScope_->Find(tok);
+    return (found && found->ToTypeName());
+  }
+
+  // Reports an error at `expr` unless its type is an integer type.
+  void EnsureInteger(Expr* expr) {
+    if (expr->Type()->IsInteger())
+      return;
+    Error(expr, "expect integer expression");
+  }
+
+  // Scope management. Leaving a block or prototype scope just steps back to
+  // the parent scope; the left scope object is not deleted here.
+  void EnterBlock(FuncType* funcType=nullptr);
+  void ExitBlock() { curScope_ = curScope_->Parent(); }
+  void EnterProto() { curScope_ = new Scope(curScope_, S_PROTO); }
+  void ExitProto() { curScope_ = curScope_->Parent(); }
+  FuncDef* EnterFunc(Identifier* ident);
+  void ExitFunc();
+
+  // Looks up `label` among the labels recorded in curLabels_; nullptr when
+  // it has not been registered.
+  LabelStmt* FindLabel(const std::string& label) {
+    auto hit = curLabels_.find(label);
+    return hit == curLabels_.end() ? nullptr : hit->second;
+  }
+  // Registers `labelStmt` under `label`; the label must not already exist.
+  void AddLabel(const std::string& label, LabelStmt* labelStmt) {
+    assert(nullptr == FindLabel(label));
+    curLabels_[label] = labelStmt;
+  }
+  TranslationUnit* Unit() { return unit_; }
+  FuncDef* CurFunc() { return curFunc_; }
+  const TokenSequence& ts() const { return ts_; }
+
+protected:
+  static bool IsBuiltin(FuncType* type);
+  static bool IsBuiltin(const std::string& name);
+  static Identifier* GetBuiltin(const Token* tok);
+  static void DefineBuiltins();
+
+  static FuncType* vaStartType_;
+  static FuncType* vaArgType_;
+
+  // The root of the AST
+  TranslationUnit* unit_;
+
+  TokenSequence& ts_;
+
+  // It is not the real scope,
+  // It contains all external symbols(resolved and not resolved)
+  Scope* externalSymbols_;
+
+  const Token* errTok_;
+  Scope* curScope_;
+  FuncDef* curFunc_;
+  LabelMap curLabels_;
+  LabelJumpList unresolvedJumps_;
+
+  LabelStmt* breakDest_;
+  LabelStmt* continueDest_;
+  CaseLabelList* caseLabels_;
+  LabelStmt* defaultLabel_;
+};
+
+#endif
--- a/include/triton/lang/scanner.h
+++ b/include/triton/lang/scanner.h
@@ -0,0 +1,86 @@
+#pragma once
+
+#ifndef _WGTCC_SCANNER_H_
+#define _WGTCC_SCANNER_H_
+
+#include "error.h"
+#include "encoding.h"
+#include "token.h"
+
+#include <string>
+#include <cassert>
+
+
+// Lexer over an in-memory text buffer. It keeps a cursor (p_) into the
+// text, the current source location (loc_) and a scratch token (tok_).
+class Scanner {
+public:
+  // Re-scan the spelling of an existing token, starting at its location.
+  explicit Scanner(const Token* tok)
+      : Scanner(&tok->str_, tok->loc_) {}
+  // Scan `text`, taking file name, line and column from `loc`.
+  Scanner(const std::string* text, const SourceLocation& loc)
+      : Scanner(text, loc.filename_, loc.line_, loc.column_) {}
+  // Scan `text` from its first character. Note that `column` is accepted but
+  // not used: the initial location always starts at column 1, with the line
+  // beginning at the first character of `text`.
+  explicit Scanner(const std::string* text,
+                   const std::string* filename=nullptr,
+                   unsigned line=1, unsigned column=1)
+      : text_(text), tok_(Token::END) {
+    // TODO(wgtdkp): initialization
+    const char* start = &(*text_)[0];
+    p_ = start;
+    loc_ = {filename, start, line, 1};
+  }
+
+  virtual ~Scanner() {}
+
+  // Scanners are not copyable.
+  Scanner(const Scanner& other) = delete;
+  Scanner& operator=(const Scanner& other) = delete;
+
+  // Scan() reads one token from the text; `ws` says whether white space
+  // preceded it (only SkipComment() is expected to pass it as true).
+  // Tokenize() scans the plain text and generates its tokens into `ts`;
+  // `ts` need not be empty, in which case the tokens are inserted at the
+  // *head* of `ts`.
+  Token* Scan(bool ws=false);
+  void Tokenize(TokenSequence& ts);
+  static std::string ScanHeadName(const Token* lhs, const Token* rhs);
+  Encoding ScanCharacter(int& val);
+  Encoding ScanLiteral(std::string& val);
+  std::string ScanIdentifier();
+
+private:
+  Token* SkipIdentifier();
+  Token* SkipNumber();
+  Token* SkipLiteral();
+  Token* SkipCharacter();
+  Token* MakeToken(int tag);
+  Token* MakeNewLine();
+  Encoding ScanEncoding(int c);
+  int ScanEscaped();
+  int ScanHexEscaped();
+  int ScanOctEscaped(int c);
+  int ScanUCN(int len);
+  void SkipWhiteSpace();
+  void SkipComment();
+
+  // A universal character name starts with a backslash followed by 'u' or
+  // 'U'; `c` is the character just read, the next one is only peeked at.
+  bool IsUCN(int c) { return c == '\\' && (Test('u') || Test('U')); }
+  bool IsOctal(int c) { return '0' <= c && c <= '7'; }
+  int XDigit(int c);
+
+  // True once the cursor sits on the terminating NUL of the text.
+  bool Empty() const { return *p_ == 0; }
+  int Peek();
+  // True when the next character is `c`; the cursor does not move.
+  bool Test(int c) { return Peek() == c; }
+  int Next();
+  void PutBack();
+  // Consumes the next character only when it is `c`; reports whether it did.
+  bool Try(int c) {
+    if (Peek() != c)
+      return false;
+    Next();
+    return true;
+  }
+  // Stamps the scratch token with the current location.
+  void Mark() { tok_.loc_ = loc_; }
+
+  const std::string* text_;
+  SourceLocation loc_;
+  Token tok_;
+  const char* p_;
+};
+
+
+// Defined out of line. Takes a file name and hands back a pointer to a
+// string.
+// NOTE(review): presumably the file's contents in a heap-allocated string
+// that the caller owns -- confirm in scanner.cc, including the behaviour
+// when the file cannot be opened.
+std::string* ReadFile(const std::string& filename);
+
+#endif
--- a/include/triton/lang/scope.h
+++ b/include/triton/lang/scope.h
@@ -0,0 +1,72 @@
+#pragma once
+
+#ifndef _WGTCC_SCOPE_H_
+#define _WGTCC_SCOPE_H_
+
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+
+
+class Identifier;
+class Token;
+
+
+// Kind of a Scope. The Parser creates its current scope as S_FILE, its
+// external-symbol table as S_BLOCK and prototype scopes as S_PROTO.
+// NOTE(review): the names presumably follow the C scope kinds (file,
+// function prototype, block, function) -- confirm in scope.cc.
+enum ScopeType {
+  S_FILE,
+  S_PROTO,
+  S_BLOCK,
+  S_FUNC,
+};
+
+
+// Symbol table for one scope, linked to its enclosing scope through
+// parent_. Ordinary identifiers and tags share a single map; a tag is
+// stored under its name with the suffix "@:tag" appended.
+class Scope {
+  friend class StructType;
+  using TagList = std::vector<Identifier*>;
+  using IdentMap = std::map<std::string, Identifier*>;
+
+public:
+  explicit Scope(Scope* parent, enum ScopeType type)
+      : parent_(parent), type_(type) {}
+  ~Scope() {}
+
+  Scope* Parent() { return parent_; }
+  void SetParent(Scope* parent) { parent_ = parent; }
+  enum ScopeType Type() const { return type_; }
+
+  // Lookups by token (defined out of line).
+  Identifier* Find(const Token* tok);
+  Identifier* FindInCurScope(const Token* tok);
+  Identifier* FindTag(const Token* tok);
+  Identifier* FindTagInCurScope(const Token* tok);
+  TagList AllTagsInCurScope() const;
+
+  void Insert(Identifier* ident);
+  void Insert(const std::string& name, Identifier* ident);
+  void InsertTag(Identifier* ident);
+  void Print();
+
+  // Two scopes compare equal when they are of the same kind; their contents
+  // and parents are not compared.
+  bool operator==(const Scope& other) const { return other.type_ == type_; }
+
+  // Iteration over the entries of this scope only.
+  IdentMap::iterator begin() { return identMap_.begin(); }
+  IdentMap::iterator end() { return identMap_.end(); }
+  size_t size() const { return identMap_.size(); }
+
+private:
+  // Lookups by name (defined out of line).
+  Identifier* Find(const std::string& name);
+  Identifier* FindInCurScope(const std::string& name);
+  Identifier* FindTag(const std::string& name);
+  Identifier* FindTagInCurScope(const std::string& name);
+
+  // Builds the map key under which the tag `name` is stored.
+  std::string TagName(const std::string& name) {
+    std::string key = name;
+    key += "@:tag";
+    return key;
+  }
+  // Recognises keys produced by TagName(): longer than the five-character
+  // suffix, with '@' as the fifth character from the end.
+  static bool IsTagName(const std::string& name) {
+    const std::string::size_type len = name.size();
+    return len > 5 && name[len - 5] == '@';
+  }
+
+  // Declared private and left undefined here: scopes are not meant to be
+  // copied.
+  const Scope& operator=(const Scope& other);
+  Scope(const Scope& scope);
+
+  Scope* parent_;
+  enum ScopeType type_;
+
+  IdentMap identMap_;
+};
+
+#endif
--- a/include/triton/lang/token.h
+++ b/include/triton/lang/token.h
@@ -0,0 +1,434 @@
+#pragma once
+
+#ifndef _WGTCC_TOKEN_H_
+#define _WGTCC_TOKEN_H_
+
+#include "error.h"
+
+#include <cassert>
+#include <cstring>
+#include <iostream>
+#include <list>
+#include <set>
+#include <string>
+#include <unordered_map>
+
+
+class Generator;
+class Parser;
+class Scanner;
+class Token;
+class TokenSequence;
+
+using HideSet = std::set<std::string>;
+using TokenList = std::list<const Token*>;
+
+
+// Position of a token in the source text: the file name, a pointer to the
+// first character of the line, and the line and column numbers.
+struct SourceLocation {
+  const std::string* filename_;
+  const char* lineBegin_;
+  unsigned line_;
+  unsigned column_;
+
+  // Pointer to the character at this location: column_ is 1-based relative
+  // to lineBegin_.
+  const char* Begin() const {
+    const char* pastColumn = lineBegin_ + column_;
+    return pastColumn - 1;
+  }
+};
+
+
+// A lexical token: its tag, spelling, source location, a leading-whitespace
+// flag and an optional hide set used during macro expansion.
+// Tokens are created through the static New() factories; the constructors
+// are private (Scanner is a friend).
+class Token {
+  friend class Scanner;
+public:
+  // Token tags. The order matters: the Is*() predicates below test ranges
+  // of this enumeration, so new tags must be added inside the section they
+  // belong to.
+  enum {
+    // Punctuators
+    LPAR = '(',
+    RPAR = ')',
+    LSQB = '[',
+    RSQB = ']',
+    COLON = ':',
+    COMMA = ',',
+    SEMI = ';',
+    ADD = '+',
+    SUB = '-',
+    MUL = '*',
+    DIV = '/',
+    OR = '|',
+    AND = '&',
+    XOR = '^',
+    LESS = '<',
+    GREATER = '>',
+    EQUAL = '=',
+    DOT = '.',
+    MOD = '%',
+    LBRACE = '{',
+    RBRACE = '}',
+    TILDE = '~',
+    NOT = '!',
+    COND = '?',
+    SHARP = '#',
+    MATMUL = '@',
+    NEW_LINE = '\n',
+
+    DSHARP = 128, // '##'
+    PTR,
+    INC,
+    DEC,
+    LEFT,
+    RIGHT,
+    LE,
+    GE,
+    EQ,
+    NE,
+    LOGICAL_AND,
+    LOGICAL_OR,
+
+    MUL_ASSIGN,
+    DIV_ASSIGN,
+    MOD_ASSIGN,
+    ADD_ASSIGN,
+    SUB_ASSIGN,
+    LEFT_ASSIGN,
+    RIGHT_ASSIGN,
+    AND_ASSIGN,
+    XOR_ASSIGN,
+    OR_ASSIGN,
+
+    ELLIPSIS,
+    MASKED_DEREF,
+    // Punctuators end
+
+    // KEYWORD BEGIN
+    // TYPE QUALIFIER BEGIN
+    CONST,
+    RESTRICT,
+    VOLATILE,
+    ATOMIC,
+    // TYPE QUALIFIER END
+
+    // TYPE SPECIFIER BEGIN
+    VOID,
+    CHAR,
+    SHORT,
+    INT,
+    LONG,
+    HALF,
+    FLOAT,
+    DOUBLE,
+    SIGNED,
+    UNSIGNED,
+    BOOL,     // _Bool
+    COMPLEX,  // _Complex
+    STRUCT,
+    UNION,
+    ENUM,
+    // TYPE SPECIFIER END
+
+    ATTRIBUTE, // GNU extension __attribute__
+    // FUNCTION SPECIFIER BEGIN
+    INLINE,
+    NORETURN, // _Noreturn
+    // FUNCTION SPECIFIER END
+
+    // TILE ARITHMETICS BEGIN
+    NEWAXIS,
+    MAX,
+    MIN,
+    // TILE ARITHMETICS END
+
+    ALIGNAS, // _Alignas
+    // For syntactic convenience
+    STATIC_ASSERT, // _Static_assert
+    // STORAGE CLASS SPECIFIER BEGIN
+    TYPEDEF,
+    EXTERN,
+    STATIC,
+    THREAD, // _Thread_local
+    AUTO,
+    GLOBAL,
+    CMEM, // constant memory
+
+    // STORAGE CLASS SPECIFIER END
+    BREAK,
+    CASE,
+    CONTINUE,
+    DEFAULT,
+    DO,
+    ELSE,
+    FOR,
+    GOTO,
+    IF,
+    RETURN,
+    SIZEOF,
+    SWITCH,
+    WHILE,
+    ALIGNOF, // _Alignof
+    GENERIC, // _Generic
+    IMAGINARY, // _Imaginary
+    // function keywords
+    BITCAST,
+    EXP,
+    LOG,
+    SQRTF,
+    // KEYWORD END
+
+    IDENTIFIER,
+    CONSTANT,
+    I_CONSTANT,
+    C_CONSTANT,
+    F_CONSTANT,
+    LITERAL,
+
+    // For the parser, a identifier is a typedef name or user defined type
+    POSTFIX_INC,
+    POSTFIX_DEC,
+    PREFIX_INC,
+    PREFIX_DEC,
+    ADDR,  // '&'
+    DEREF, // '*'
+    PLUS,
+    MINUS,
+    CAST,
+    REDUCE,
+
+    // For preprocessor
+    PP_IF,
+    PP_IFDEF,
+    PP_IFNDEF,
+    PP_ELIF,
+    PP_ELSE,
+    PP_ENDIF,
+    PP_INCLUDE,
+    PP_DEFINE,
+    PP_UNDEF,
+    PP_LINE,
+    PP_ERROR,
+    PP_PRAGMA,
+    PP_NONE,
+    PP_EMPTY,
+
+
+    IGNORE,
+    INVALID,
+    END,
+    NOTOK = -1,
+  };
+
+  // Factories, defined out of line.
+  static Token* New(int tag);
+  static Token* New(const Token& other);
+  static Token* New(int tag,
+                    const SourceLocation& loc,
+                    const std::string& str,
+                    bool ws=false);
+
+  // Copies every field of `other`; the hide set is deep-copied.
+  // Self-assignment is a no-op, so it no longer allocates a duplicate hide
+  // set and drops the old one.
+  // NOTE(review): the previous hs_ of the target is still not released here,
+  // because ownership of hide sets cannot be established from this header.
+  Token& operator=(const Token& other) {
+    if (this == &other)
+      return *this;
+    tag_ = other.tag_;
+    ws_ = other.ws_;
+    loc_ = other.loc_;
+    str_ = other.str_;
+    hs_ = other.hs_ ? new HideSet(*other.hs_): nullptr;
+    return *this;
+  }
+  virtual ~Token() {}
+
+  // Maps a spelling to its keyword tag.
+  // Token::NOTOK represents not a kw.
+  static int KeyWordTag(const std::string& key) {
+    auto kwIter = kwTypeMap_.find(key);
+    if (kwTypeMap_.end() == kwIter)
+      return Token::NOTOK;  // Not a key word type
+    return kwIter->second;
+  }
+  static bool IsKeyWord(const std::string& name);
+  static bool IsKeyWord(int tag) { return CONST <= tag && tag < IDENTIFIER; }
+  bool IsKeyWord() const { return IsKeyWord(tag_); }
+  // Covers the whole punctuator section of the enumeration, including
+  // MASKED_DEREF, which sits before the "Punctuators end" marker.
+  bool IsPunctuator() const { return 0 <= tag_ && tag_ <= MASKED_DEREF; }
+  bool IsLiteral() const { return tag_ == LITERAL; }
+  bool IsConstant() const { return CONSTANT <= tag_ && tag_ <= F_CONSTANT; }
+  bool IsIdentifier() const { return IDENTIFIER == tag_; }
+  bool IsEOF() const { return tag_ == Token::END; }
+  bool IsTypeSpecQual() const { return CONST <= tag_ && tag_ <= ENUM; }
+  // Covers everything from the first type qualifier through the last
+  // storage class specifier, including CMEM, which sits before the
+  // "STORAGE CLASS SPECIFIER END" marker.
+  bool IsDecl() const { return CONST <= tag_ && tag_ <= CMEM; }
+  // Spelling registered for `tag`, or nullptr when there is none.
+  static const char* Lexeme(int tag) {
+    auto iter = tagLexemeMap_.find(tag);
+    if (iter == tagLexemeMap_.end())
+      return nullptr;
+
+    return iter->second;
+  }
+
+  int tag_;
+
+  // 'ws_' stands for whether there is preceding white space.
+  // This is to simplify the '#' operator (stringize) in macro expansion.
+  bool ws_ { false };
+  SourceLocation loc_;
+
+  std::string str_;
+  HideSet* hs_ { nullptr };
+
+private:
+  explicit Token(int tag): tag_(tag) {}
+  Token(int tag, const SourceLocation& loc,
+        const std::string& str, bool ws=false)
+      : tag_(tag), ws_(ws), loc_(loc), str_(str) {}
+
+  // Copy construction is expressed through the assignment operator; hs_
+  // starts out null thanks to its default member initializer.
+  Token(const Token& other) {
+    *this = other;
+  }
+
+  static const std::unordered_map<std::string, int> kwTypeMap_;
+  static const std::unordered_map<int, const char*> tagLexemeMap_;
+};
+
+
+class TokenSequence {
+  friend class Preprocessor;
+
+public:
+  TokenSequence(): tokList_(new TokenList()),
+                   begin_(tokList_->begin()), end_(tokList_->end()) {}
+  explicit TokenSequence(Token* tok) {
+    TokenSequence();
+    InsertBack(tok);
+  }
+  explicit TokenSequence(TokenList* tokList)
+      : tokList_(tokList),
+        begin_(tokList->begin()),
+        end_(tokList->end()) {}
+  TokenSequence(TokenList* tokList,
+                TokenList::iterator begin,
+                TokenList::iterator end)
+      : tokList_(tokList), begin_(begin), end_(end) {}
+  ~TokenSequence() {}
+  TokenSequence(const TokenSequence& other) { *this = other; }
+  const TokenSequence& operator=(const TokenSequence& other) {
+    tokList_ = other.tokList_;
+    begin_ = other.begin_;
+    end_ = other.end_;
+    return *this;
+  }
+  void Copy(const TokenSequence& other) {
+    tokList_ = new TokenList(other.begin_, other.end_);
+    begin_ = tokList_->begin();
+    end_ = tokList_->end();
+    for (auto iter = begin_; iter != end_; ++iter)
+      *iter = Token::New(**iter);
+  }
+  void UpdateHeadLocation(const SourceLocation& loc) {
+    assert(!Empty());
+    auto tok = const_cast<Token*>(Peek());
+    tok->loc_ = loc;
+  }
+  void FinalizeSubst(bool leadingWS, const HideSet& hs) {
+    auto ts = *this;
+    while (!ts.Empty()) {
+      auto tok = const_cast<Token*>(ts.Next());
+      if (!tok->hs_)
+        tok->hs_ = new HideSet(hs);
+      else
+        tok->hs_->insert(hs.begin(), hs.end());
+    }
+    // Even if the token sequence is empty
+    const_cast<Token*>(Peek())->ws_ = leadingWS;
+  }
+
+  const Token* Expect(int expect);
+  bool Try(int tag) {
+    if (Peek()->tag_ == tag) {
+      Next();
+      return true;
+    }
+    return false;
+  }
+  bool Test(int tag) { return Peek()->tag_ == tag; }
+  const Token* Next() {
+    auto ret = Peek();
+    if (!ret->IsEOF()) {
+      ++begin_;
+      Peek(); // May skip newline token, but why ?
+    } else {
+      ++exceed_end;
+    }
+    return ret;
+  }
+  // Undoes the most recent Next().
+  // A Next() made at EOF only incremented exceed_end, so it is undone by
+  // decrementing that counter. Otherwise begin_ steps back one token and keeps
+  // stepping back for as long as it rests on a NEW_LINE token.
+  void PutBack() {
+    if (exceed_end > 0) {
+      // No iterator moves on this path, so the begin-of-list precondition
+      // below is not required (e.g. undoing Next() at EOF of an empty list).
+      --exceed_end;
+      return;
+    }
+    // Stepping back from the first element of the list is not allowed.
+    assert(begin_ != tokList_->begin());
+    --begin_;
+    while ((*begin_)->tag_ == Token::NEW_LINE) {
+      assert(begin_ != tokList_->begin());
+      --begin_;
+    }
+  }
+  const Token* Peek() const;
+  // Returns the token that follows the one Peek() yields, without consuming
+  // anything: it calls Next(), peeks, then PutBack(). On an empty sequence it
+  // returns whatever Peek() returns.
+  const Token* Peek2() {
+    if (!Empty()) {
+      Next();
+      const Token* second = Peek();
+      PutBack();
+      return second;
+    }
+    return Peek(); // Return the Token::END
+  }
+  // Returns the token stored immediately before end_.
+  const Token* Back() const {
+    TokenList::iterator last = end_;
+    --last;
+    return *last;
+  }
+  // Removes the last token of the underlying list.
+  // Requires a non-empty sequence whose end_ equals the end of the list
+  // (both asserted).
+  void PopBack() {
+    assert(!Empty());
+    assert(end_ == tokList_->end());
+    // Decide this before popping: when the head token is the one about to be
+    // removed, begin_ is moved to end_ afterwards.
+    const bool headIsLast = (tokList_->back() == *begin_);
+    tokList_->pop_back();
+    end_ = tokList_->end();
+    if (headIsLast) {
+      begin_ = end_;
+    }
+  }
+  // Returns the current value of begin_, suitable for a later ResetTo().
+  TokenList::iterator Mark() {
+    return begin_;
+  }
+  // Sets begin_ to `mark` (e.g. a value obtained from Mark()).
+  // exceed_end is not modified.
+  void ResetTo(TokenList::iterator mark) {
+    begin_ = mark;
+  }
+  // True when the token returned by Peek() is tagged Token::END.
+  bool Empty() const {
+    return Peek()->tag_ == Token::END;
+  }
+  // Inserts the tokens in the range ts.begin_..ts.end_ into the list just
+  // before end_. When begin_ equals end_, begin_ is set to the iterator that
+  // insert() returned.
+  void InsertBack(TokenSequence& ts) {
+    TokenList::iterator inserted = tokList_->insert(end_, ts.begin_, ts.end_);
+    if (begin_ == end_)
+      begin_ = inserted;
+  }
+  // Inserts the single token `tok` into the list just before end_. When
+  // begin_ equals end_, begin_ is set to the iterator that insert() returned.
+  void InsertBack(const Token* tok) {
+    TokenList::iterator inserted = tokList_->insert(end_, tok);
+    if (begin_ == end_)
+      begin_ = inserted;
+  }
+
+  // Inserts the tokens in the range ts.begin_..ts.end_ at the position chosen
+  // by GetInsertFrontPos(), which backs up over NEW_LINE tokens that precede
+  // begin_. begin_ is then set to the iterator that insert() returned.
+  void InsertFront(TokenSequence& ts) {
+    begin_ = tokList_->insert(GetInsertFrontPos(), ts.begin_, ts.end_);
+  }
+  // Inserts the single token `tok` at the position chosen by
+  // GetInsertFrontPos() and sets begin_ to the iterator insert() returned.
+  void InsertFront(const Token* tok) {
+    begin_ = tokList_->insert(GetInsertFrontPos(), tok);
+  }
+  bool IsBeginOfLine() const;
+  TokenSequence GetLine();
+  // Stores `parser` in parser_.
+  void SetParser(Parser* parser) {
+    parser_ = parser;
+  }
+  void Print(FILE* fp=stdout) const;
+  void Print(std::string *str) const;
+
+private:
+  // Finds the insert position for InsertFront(): starting from begin_, backs
+  // up over the run of NEW_LINE tokens that immediately precedes it and
+  // returns the position just after the token where the backward scan stops.
+  // Returns begin_ unchanged when it is already the start of the list.
+  TokenList::iterator GetInsertFrontPos() {
+    if (begin_ == tokList_->begin())
+      return begin_;
+    TokenList::iterator cursor = begin_;
+    // Walk backwards while still on a NEW_LINE and not yet at the list start.
+    for (--cursor;
+         cursor != tokList_->begin() && (*cursor)->tag_ == Token::NEW_LINE;
+         --cursor) {
+    }
+    return ++cursor;
+  }
+
+  TokenList* tokList_;
+  mutable TokenList::iterator begin_;
+  TokenList::iterator end_;
+  Parser* parser_ {nullptr};
+  int exceed_end {0};
+};
+
+#endif
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Philippe Tillet	925be2ec4e	[PYTHON] Modified version number to v0.4	2021-05-06 02:58:42 -04:00
Philippe Tillet	bf7de6b4b2	[DOCS] Various improvements and typo fixes	2021-05-06 02:58:14 -04:00
Philippe Tillet	4d41796d61	[CODEGEN] Make sure peephole is called before anything else in codegen	2021-03-28 17:08:38 -04:00
Philippe Tillet	6c2e3d064d	[PYTHON] Fixed formatting issue in conv.c	2021-03-26 01:37:23 -04:00
Philippe Tillet	8e15a54d58	[BUILD] Remove compilation warnings	2021-03-24 01:24:50 -04:00
Philippe Tillet	ce7c0a2b10	[CI] Changed triton-nightly to --pre triton (#78 ) The solution proposed in #77 can create namespace conflicts when triton and triton-nightly have both been pip installed. Therefore, this PR is moving nightly releases to pre-releases in the main triton index.	2021-03-23 03:32:51 -04:00
Philippe Tillet	9c05ec148f	[BUILD] Added automatic nightly build releases to pip in CI; removed build-time dependence on LLVM and PyTorch (#77 ) Recently there has been more and more report about installation issues: - Installing Triton before upgrading pytorch can create some issues because Triton uses some torch headers - llvm-10-dev not available on some platform; llvm-11-dev not available on e.g. Ubuntu. absence of nightly builds This PR should fix all these issues. Some CMake tricks are used to download and install llvm at build time. Triton Python bindings were modified to remove dependence on pytorch ops. Midnight CI job added to generate binary wheels for all Triton version and update them on pypi's new triton-nightly project. This PR will also make it very easy to use LLVM forks in the future for whatever needs we have.	2021-03-22 20:03:37 -04:00
Philippe Tillet	a905fe6ec5	[DOCS] Uncommented sphinx gallery	2021-03-19 16:19:13 -04:00
Philippe Tillet	0042d7e390	[DOCS] Improved index	2021-03-19 15:37:15 -04:00
Philippe Tillet	ec51a2e9a5	[DOCS] Added non-tutorial documentation pages	2021-03-19 15:27:19 -04:00
Philippe Tillet	2f8f0042a9	[DOCS] Added matrix multiplication tutorial	2021-03-15 13:57:41 -04:00
Philippe Tillet	d1c0bf2bea	[DOCS] Removed pip installation instruction as version on Pip is not up-to-date	2021-03-11 12:05:34 -05:00
Philippe Tillet	134e246117	[DOCS] Improved plots in tutorials	2021-03-11 00:42:29 -05:00
Philippe Tillet	58207d4647	[PYTHON] CUTLASS wrapper for fair benchmarks (#75 ) Before this commit, the benchmarking infrastructure used heterogeneous protocols between library (e.g., CUTLASS uses a C++ binary that reports mean TFLOPS; torch and triton use python call and report 10th, 50th and 90th quantiles). For the sake of uniformity and fair benchmark practices, this PR adds a python wrapper for auto-tuned CUTLASS matrix multiplication. Benchmarks have been rewritten to use this wrapper with `triton.testing.do_bench` rather than system calls to CUTLASS profiler. Importantly, this also ensures that all the matmuls are done on the same input data which should stabilize clock across providers.	2021-03-09 16:32:44 -05:00
Philippe Tillet	d25b7bc115	[README] Now linking to the documentation	2021-03-08 20:22:32 -05:00
Philippe Tillet	4781f979b2	[PYTHON] Made `bench_blocksparse` and `bench_cross_entropy` compatible with the new performance report API	2021-03-08 20:19:10 -05:00
Philippe Tillet	061ef3920e	[CODEGEN] Fixed bug that caused conditional operator to not always properly mask load operations Also includes minor improvement to benchmarking infrastructure	2021-03-08 20:04:26 -05:00
Philippe Tillet	dfa0d45ffe	[DOCS] Improved tutorials documentation	2021-03-06 22:04:00 -05:00
Philippe Tillet	b8f2875d28	[PYTHON] Changed benchmarking strategy. Instead of enqueueing many kernels before synchronizing, the kernels are now enqueued one by one. This makes it possible to clear the L2 cache before running the workload, and also potentially collect some variance data for error bars in plots	2021-03-06 22:02:18 -05:00
Philippe Tillet	e78211c8f5	[DOCS] Re-structured documentation hierarchy	2021-03-06 17:26:49 -05:00
Philippe Tillet	85d1b02e16	[DOCS] Switched tutorials to Python and use Sphinx Gallery	2021-03-06 14:03:01 -05:00
Philippe Tillet	5dd4cfc077	[DOCS] Added .ipynb tutorials in docs	2021-03-06 02:57:41 -05:00