From acd5e44611e33b05a9ab168ddda14bea498ba0d1 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 28 Jul 2021 01:51:17 -0700 Subject: [PATCH] [GENERAL] Some minor improvements here and there to build systems and docs (#148) --- .../{build-website.yml => documentation.yml} | 4 +-- .../{build-wheels.yml => wheels.yml} | 2 +- CMakeLists.txt | 29 ++++++++++++------- README.md | 20 ++++++++++--- docs/getting-started/installation.rst | 29 +++++++------------ python/setup.py | 17 ++++++----- python/tutorials/01-vector-add.py | 8 ++--- 7 files changed, 60 insertions(+), 49 deletions(-) rename .github/workflows/{build-website.yml => documentation.yml} (94%) rename .github/workflows/{build-wheels.yml => wheels.yml} (98%) diff --git a/.github/workflows/build-website.yml b/.github/workflows/documentation.yml similarity index 94% rename from .github/workflows/build-website.yml rename to .github/workflows/documentation.yml index f991d2ef4..a61a77ee9 100644 --- a/.github/workflows/build-website.yml +++ b/.github/workflows/documentation.yml @@ -1,4 +1,4 @@ -name: Build Website +name: Documentation on: workflow_dispatch: schedule: @@ -6,7 +6,7 @@ on: jobs: - Build-Website: + Build-Documentation: runs-on: self-hosted diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/wheels.yml similarity index 98% rename from .github/workflows/build-wheels.yml rename to .github/workflows/wheels.yml index 7175428c5..1d8d450f2 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/wheels.yml @@ -1,4 +1,4 @@ -name: Build Wheels +name: Wheels on: workflow_dispatch: schedule: diff --git a/CMakeLists.txt b/CMakeLists.txt index a66601cd4..bc3fccd65 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,21 +25,28 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__STDC_FORMAT_MACROS -std=gnu++17") -# if(APPLE) -# set(CMAKE_OSX_SYSROOT "/") -# set(CMAKE_OSX_DEPLOYMENT_TARGET "") -# endif() - - ########## # LLVM 
########## -find_package(LLVM 11 REQUIRED COMPONENTS "nvptx") -message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") -include_directories("${LLVM_INCLUDE_DIRS}") -if(APPLE) - set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14") +if("${LLVM_LIBRARY_DIR}" STREQUAL "") + find_package(LLVM 11 REQUIRED COMPONENTS "nvptx") + message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") + if(APPLE) + set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14") + endif() +# sometimes we don't want to use llvm-config, since it may have been downloaded for some specific linux distros +else() + set(LLVM_LDFLAGS "-L${LLVM_LIBRARY_DIR}") + set(LLVM_LIBRARIES libLLVMNVPTXCodeGen.a libLLVMSelectionDAG.a libLLVMipo.a libLLVMInstrumentation.a + libLLVMVectorize.a libLLVMLinker.a libLLVMIRReader.a libLLVMAsmParser.a libLLVMFrontendOpenMP.a + libLLVMAsmPrinter.a libLLVMDebugInfoDWARF.a libLLVMCodeGen.a libLLVMTarget.a libLLVMScalarOpts.a + libLLVMInstCombine.a libLLVMAggressiveInstCombine.a libLLVMTransformUtils.a libLLVMBitWriter.a + libLLVMAnalysis.a libLLVMProfileData.a libLLVMObject.a libLLVMTextAPI.a libLLVMMCParser.a + libLLVMBitReader.a libLLVMCore.a libLLVMRemarks.a libLLVMBitstreamReader.a libLLVMNVPTXDesc.a + libLLVMMC.a libLLVMDebugInfoCodeView.a libLLVMDebugInfoMSF.a libLLVMBinaryFormat.a libLLVMNVPTXInfo.a + libLLVMSupport.a libLLVMDemangle.a) endif() +include_directories("${LLVM_INCLUDE_DIRS}") # Python module if(BUILD_PYTHON_MODULE) diff --git a/README.md b/README.md index 78c3e0664..0eabf8129 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,20 @@ -Triton logo +
+ Triton logo +
+ +[![Wheels](https://github.com/openai/triton/actions/workflows/wheels.yml/badge.svg?branch=master)](https://github.com/openai/triton/actions/workflows/wheels.yml) + + +**`Documentation`** | +------------------- | +[![Documentation](https://github.com/openai/triton/actions/workflows/documentation.yml/badge.svg)](https://triton-lang.org/) + # Triton This is the development repository of Triton, a language and compiler for writing highly efficient custom Deep-Learning primitives. The aim of Triton is to provide an open-source environment to write fast code at higher productivity than CUDA, but also with higher flexibility than other existing DSLs. -[![Build Status](https://dev.azure.com/triton-lang/Triton/_apis/build/status/ptillet.triton?branchName=master)](https://dev.azure.com/triton-lang/Triton/_build/latest?definitionId=10&branchName=master) - -The foundations of this project are described in the following MAPL2019 publication: [Triton: An Intermediate Language and Compiler for Tiled Neural Network Computations](http://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf). Please consider citing us if you use our work! +The foundations of this project are described in the following MAPL2019 publication: [Triton: An Intermediate Language and Compiler for Tiled Neural Network Computations](http://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf). Please consider citing this work if you use Triton! The [official documentation](https://triton-lang.org) contains installation instructions and tutorials. @@ -18,3 +26,7 @@ Supported Platforms: Supported Hardware: * NVIDIA GPUs (Compute Capability 7.0+) * Under development: AMD GPUs, CPUs + +# Disclaimer + +Triton is a fairly recent project, and it is under active development. 
We expect it to be pretty useful in a wide variety of cases, but don't be surprised if it's a bit rough around the edges :) \ No newline at end of file diff --git a/docs/getting-started/installation.rst b/docs/getting-started/installation.rst index 1a514fe9f..57fd589a2 100644 --- a/docs/getting-started/installation.rst +++ b/docs/getting-started/installation.rst @@ -6,7 +6,13 @@ Installation Binary Distributions --------------------- -You can install the latest nightly release of Triton from pip: +You can install the latest stable release of Triton from pip: + + pip install triton + +Binary wheels are available for CPython 3.6-3.9 and PyPy 3.6-3.7. + +And the latest nightly release: .. code-block:: bash @@ -27,9 +33,10 @@ You can install the Python package from source by running the following commands git clone https://github.com/ptillet/triton.git; cd triton/python; + pip install cmake; # build time dependency pip install -e . -This may take a while (10-20 minutes) as it will download and compile LLVM from source. +Note that, if llvm-11 is not present on your system, the setup.py script will download LLVM static libraries on the web and link against that. You can then test your installation by running the unit tests: @@ -42,20 +49,4 @@ and the benchmarks .. code-block:: bash cd bench/ - python -m run --with-plots --result-dir /tmp/triton-bench - -+++++++++++++++ -C++ Package -+++++++++++++++ - -Those not interested in Python integration may want to use the internals of Triton (i.e, runtime, parser, codegen, driver, intermediate representation) directly. This can be done by running the following commands: - -.. 
code-block:: bash - - git clone https://github.com/ptillet/triton.git; - mkdir build; - cd build; - cmake ../; - make -j8; - -Note that while direct usage of the C++ API is not officially supported, a usage tutorial can be found `here `_ + python -m run --with-plots --result-dir /tmp/triton-bench \ No newline at end of file diff --git a/python/setup.py b/python/setup.py index 8ae2402d1..d53a6310d 100644 --- a/python/setup.py +++ b/python/setup.py @@ -23,12 +23,13 @@ def get_llvm(): paths = [distutils.spawn.find_executable(cfg) for cfg in supported] paths = [p for p in paths if p is not None] if paths: - return paths[0] + return '', '' # download if nothing is installed name = 'clang+llvm-11.0.1-x86_64-linux-gnu-ubuntu-16.04' dir = '/tmp' - llvm_config = '{dir}/{name}/bin/llvm-config'.format(dir=dir, name=name) - if not os.path.exists(llvm_config): + llvm_include_dir = '{dir}/{name}/include'.format(dir=dir, name=name) + llvm_library_dir = '{dir}/{name}/lib'.format(dir=dir, name=name) + if not os.path.exists(llvm_library_dir): try: shutil.rmtree(os.path.join(dir, name)) except: @@ -38,7 +39,7 @@ def get_llvm(): ftpstream = urllib.request.urlopen(url) file = tarfile.open(fileobj=ftpstream, mode="r|xz") file.extractall(path=dir) - return llvm_config + return llvm_include_dir, llvm_library_dir class CMakeExtension(Extension): @@ -76,7 +77,7 @@ class CMakeBuild(build_ext): self.build_extension(ext) def build_extension(self, ext): - llvm_config = get_llvm() + llvm_include_dir, llvm_library_dir = get_llvm() # self.debug = True extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.path))) # create build directories @@ -88,12 +89,12 @@ class CMakeBuild(build_ext): os.makedirs(llvm_build_dir) # python directories python_include_dirs = [distutils.sysconfig.get_python_inc()] + ['/usr/local/cuda/include'] - python_lib_dirs = distutils.sysconfig.get_config_var("LIBDIR") cmake_args = [ "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir, "-DBUILD_TUTORIALS=OFF", 
"-DBUILD_PYTHON_MODULE=ON", - "-DLLVM_CONFIG=" + llvm_config, + "-DLLVM_INCLUDE_DIRS=" + llvm_include_dir, + "-DLLVM_LIBRARY_DIR=" + llvm_library_dir, #'-DPYTHON_EXECUTABLE=' + sys.executable, #'-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON', "-DTRITON_LLVM_BUILD_DIR=" + llvm_build_dir, @@ -126,7 +127,7 @@ setup( description="A language and compiler for custom Deep Learning operations", long_description="", packages=["triton", "triton/_C", "triton/tools", "triton/ops", "triton/ops/blocksparse"], - install_requires=["numpy", "torch"], + install_requires=["torch"], package_data={"triton/ops": ["*.c"], "triton/ops/blocksparse": ["*.c"]}, include_package_data=True, ext_modules=[CMakeExtension("triton", "triton/_C/")], diff --git a/python/tutorials/01-vector-add.py b/python/tutorials/01-vector-add.py index 122e1ac13..ad6303858 100644 --- a/python/tutorials/01-vector-add.py +++ b/python/tutorials/01-vector-add.py @@ -41,8 +41,8 @@ def _add( # %% -# Let's also declare a helper function that to (1) allocate the output vector -# and (2) enqueueing the above kernel. +# Let's also declare a helper function to (1) allocate the `z` tensor +# and (2) enqueue the above kernel with appropriate grid/block sizes. def add(x, y): @@ -80,7 +80,7 @@ print(f'The maximum difference between torch and triton is ' f'{torch.max(torch. # %% # Benchmark # ----------- -# We can now benchmark our custom op for vectors of increasing sizes to get a sense of how it does relative to PyTorch. +# We can now benchmark our custom op on vectors of increasing sizes to get a sense of how it does relative to PyTorch. # To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of your custom ops # for different problem sizes. @@ -111,6 +111,6 @@ def benchmark(size, provider): # %% -# We can now run the decorated function above. Pass `show_plots=True` to see the plots and/or +# We can now run the decorated function above. 
Pass `print_data=True` to see the performance numbers, `show_plots=True` to plot them, and/or # `save_path='/path/to/results/' to save them to disk along with raw CSV data benchmark.run(print_data=True, show_plots=True) \ No newline at end of file