diff --git a/_downloads/62d97d49a32414049819dd8bb8378080/01-vector-add.py b/_downloads/62d97d49a32414049819dd8bb8378080/01-vector-add.py
index 122e1ac13..ad6303858 100644
--- a/_downloads/62d97d49a32414049819dd8bb8378080/01-vector-add.py
+++ b/_downloads/62d97d49a32414049819dd8bb8378080/01-vector-add.py
@@ -41,8 +41,8 @@ def _add(
# %%
-# Let's also declare a helper function that to (1) allocate the output vector
-# and (2) enqueueing the above kernel.
+# Let's also declare a helper function to (1) allocate the `z` tensor
+# and (2) enqueue the above kernel with appropriate grid/block sizes.
def add(x, y):
@@ -80,7 +80,7 @@ print(f'The maximum difference between torch and triton is ' f'{torch.max(torch.
# %%
# Benchmark
# -----------
-# We can now benchmark our custom op for vectors of increasing sizes to get a sense of how it does relative to PyTorch.
+# We can now benchmark our custom op on vectors of increasing sizes to get a sense of how it does relative to PyTorch.
# To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of your custom ops
# for different problem sizes.
@@ -111,6 +111,6 @@ def benchmark(size, provider):
# %%
-# We can now run the decorated function above. Pass `show_plots=True` to see the plots and/or
+# We can now run the decorated function above. Pass `print_data=True` to see the performance numbers, `show_plots=True` to plot them, and/or
# `save_path='/path/to/results/' to save them to disk along with raw CSV data
benchmark.run(print_data=True, show_plots=True)
\ No newline at end of file
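
The hunk above rewords the docstring for the tutorial's `add` wrapper. For orientation while reading the diff, here is a minimal, self-contained sketch of what the kernel-plus-wrapper pair amounts to, written against the `triton.jit` API; names such as `BLOCK_SIZE` and the block size of 1024 are illustrative assumptions, not the tutorial's exact source.

    import torch
    import triton
    import triton.language as tl

    @triton.jit
    def _add(x_ptr, y_ptr, z_ptr, N, BLOCK_SIZE: tl.constexpr):
        # Each program instance handles one BLOCK_SIZE-wide chunk of the vectors.
        pid = tl.program_id(axis=0)
        offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
        mask = offsets < N  # guard the tail block against out-of-bounds accesses
        x = tl.load(x_ptr + offsets, mask=mask)
        y = tl.load(y_ptr + offsets, mask=mask)
        tl.store(z_ptr + offsets, x + y, mask=mask)

    def add(x, y):
        # (1) allocate the `z` tensor, (2) enqueue the kernel on a 1D grid
        z = torch.empty_like(x)
        N = z.shape[0]
        grid = lambda meta: (triton.cdiv(N, meta['BLOCK_SIZE']),)
        _add[grid](x, y, z, N, BLOCK_SIZE=1024)
        return z
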
diff --git a/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip b/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip
index a64b0a1fe..85fcdbb4b 100644
Binary files a/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip and b/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip differ
diff --git a/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip b/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip
index 4bf5a9d63..7ab3fb329 100644
Binary files a/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip and b/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip differ
diff --git a/_downloads/f191ee1e78dc52eb5f7cba88f71cef2f/01-vector-add.ipynb b/_downloads/f191ee1e78dc52eb5f7cba88f71cef2f/01-vector-add.ipynb
index facb8e259..19763a6af 100644
--- a/_downloads/f191ee1e78dc52eb5f7cba88f71cef2f/01-vector-add.ipynb
+++ b/_downloads/f191ee1e78dc52eb5f7cba88f71cef2f/01-vector-add.ipynb
@@ -40,7 +40,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Let's also declare a helper function that to (1) allocate the output vector\nand (2) enqueueing the above kernel.\n\n"
+ "Let's also declare a helper function to (1) allocate the `z` tensor\nand (2) enqueue the above kernel with appropriate grid/block sizes.\n\n"
]
},
{
@@ -83,7 +83,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Benchmark\nWe can now benchmark our custom op for vectors of increasing sizes to get a sense of how it does relative to PyTorch.\nTo make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of your custom ops\nfor different problem sizes.\n\n"
+ "## Benchmark\nWe can now benchmark our custom op on vectors of increasing sizes to get a sense of how it does relative to PyTorch.\nTo make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of your custom ops\nfor different problem sizes.\n\n"
]
},
{
@@ -101,7 +101,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "We can now run the decorated function above. Pass `show_plots=True` to see the plots and/or\n`save_path='/path/to/results/' to save them to disk along with raw CSV data\n\n"
+ "We can now run the decorated function above. Pass `print_data=True` to see the performance number, `show_plots=True` to plot them, and/or\n`save_path='/path/to/results/' to save them to disk along with raw CSV data\n\n"
]
},
{
diff --git a/_images/sphx_glr_01-vector-add_001.png b/_images/sphx_glr_01-vector-add_001.png
index c0a985d8f..eca8ea46e 100644
Binary files a/_images/sphx_glr_01-vector-add_001.png and b/_images/sphx_glr_01-vector-add_001.png differ
diff --git a/_images/sphx_glr_01-vector-add_thumb.png b/_images/sphx_glr_01-vector-add_thumb.png
index 392112bdd..912915ae6 100644
Binary files a/_images/sphx_glr_01-vector-add_thumb.png and b/_images/sphx_glr_01-vector-add_thumb.png differ
diff --git a/_images/sphx_glr_02-fused-softmax_001.png b/_images/sphx_glr_02-fused-softmax_001.png
index ad6d87c1e..c1332c255 100644
Binary files a/_images/sphx_glr_02-fused-softmax_001.png and b/_images/sphx_glr_02-fused-softmax_001.png differ
diff --git a/_images/sphx_glr_02-fused-softmax_thumb.png b/_images/sphx_glr_02-fused-softmax_thumb.png
index 6e732fde8..7dad44800 100644
Binary files a/_images/sphx_glr_02-fused-softmax_thumb.png and b/_images/sphx_glr_02-fused-softmax_thumb.png differ
diff --git a/_images/sphx_glr_03-matrix-multiplication_001.png b/_images/sphx_glr_03-matrix-multiplication_001.png
index 7b9e8d723..5ef51868d 100644
Binary files a/_images/sphx_glr_03-matrix-multiplication_001.png and b/_images/sphx_glr_03-matrix-multiplication_001.png differ
diff --git a/_images/sphx_glr_03-matrix-multiplication_thumb.png b/_images/sphx_glr_03-matrix-multiplication_thumb.png
index 9102541d7..5ab57c015 100644
Binary files a/_images/sphx_glr_03-matrix-multiplication_thumb.png and b/_images/sphx_glr_03-matrix-multiplication_thumb.png differ
diff --git a/_sources/getting-started/installation.rst.txt b/_sources/getting-started/installation.rst.txt
index 1a514fe9f..57fd589a2 100644
--- a/_sources/getting-started/installation.rst.txt
+++ b/_sources/getting-started/installation.rst.txt
@@ -6,7 +6,13 @@ Installation
Binary Distributions
---------------------
-You can install the latest nightly release of Triton from pip:
+You can install the latest stable release of Triton from pip:
+
+.. code-block:: bash
+
+ pip install triton
+
+Binary wheels are available for CPython 3.6-3.9 and PyPy 3.6-3.7.
+
+And the latest nightly release:
.. code-block:: bash
@@ -27,9 +33,10 @@ You can install the Python package from source by running the following commands
git clone https://github.com/ptillet/triton.git;
cd triton/python;
+ pip install cmake; # build time dependency
pip install -e .
-This may take a while (10-20 minutes) as it will download and compile LLVM from source.
+Note that if llvm-11 is not present on your system, the setup.py script will download LLVM static libraries from the web and link against them.
You can then test your installation by running the unit tests:
@@ -42,20 +49,4 @@ and the benchmarks
.. code-block:: bash
cd bench/
- python -m run --with-plots --result-dir /tmp/triton-bench
-
-+++++++++++++++
-C++ Package
-+++++++++++++++
-
-Those not interested in Python integration may want to use the internals of Triton (i.e, runtime, parser, codegen, driver, intermediate representation) directly. This can be done by running the following commands:
-
-.. code-block:: bash
-
- git clone https://github.com/ptillet/triton.git;
- mkdir build;
- cd build;
- cmake ../;
- make -j8;
-
-Note that while direct usage of the C++ API is not officially supported, a usage tutorial can be found `here
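
After installing either the stable wheel or a source build, a quick import probe can confirm the environment before running the unit tests mentioned above. This is a generic sketch, not part of the documented install flow:

    # Minimal post-install smoke test (assumes a CUDA-capable machine).
    import torch
    import triton

    print(triton.__version__)
    assert torch.cuda.is_available(), 'Triton kernels need a CUDA device'
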
Out:

 vector-add-performance:
              size        Triton         Torch
-0          4096.0      9.600000      9.600000
+0          4096.0      9.540372      9.600000
 1          8192.0     19.200000     19.200000
 2         16384.0     38.400001     38.400001
 3         32768.0     76.800002     76.800002
 4         65536.0    127.999995    127.999995
 5        131072.0    219.428568    219.428568
-6        262144.0    341.333321    341.333321
+6        262144.0    341.333321    384.000001
 7        524288.0    472.615390    472.615390
 8       1048576.0    614.400016    614.400016
 9       2097152.0    722.823517    722.823517
@@ -323,7 +323,7 @@ for different problem sizes.
 15   134217728.0    851.577704    850.656574

-Total running time of the script: ( 0 minutes 11.009 seconds)
+Total running time of the script: ( 0 minutes 10.979 seconds)
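
The table above is the diff of the rendered benchmark output for `01-vector-add`. As a rough sketch of how such a report is produced with `triton.testing.perf_report` (the decorator visible in the `.py` hunks earlier), the following reuses the `add` helper sketched near the top of this diff; the exact `do_bench` signature, in particular how quantiles are requested, varies across Triton versions:

    import torch
    import triton
    import triton.testing

    @triton.testing.perf_report(
        triton.testing.Benchmark(
            x_names=['size'],                        # x-axis: vector length
            x_vals=[2 ** i for i in range(12, 28)],  # 4096 ... 134217728, as in the table
            x_log=True,
            line_arg='provider',
            line_vals=['triton', 'torch'],
            line_names=['Triton', 'Torch'],
            ylabel='GB/s',
            plot_name='vector-add-performance',
            args={},
        )
    )
    def benchmark(size, provider):
        x = torch.rand(size, device='cuda', dtype=torch.float32)
        y = torch.rand(size, device='cuda', dtype=torch.float32)
        op = (lambda: x + y) if provider == 'torch' else (lambda: add(x, y))
        # Median, 20th- and 80th-percentile runtimes in milliseconds.
        ms, min_ms, max_ms = triton.testing.do_bench(op, quantiles=[0.5, 0.2, 0.8])
        # Effective bandwidth: two reads and one write per element.
        gbps = lambda t: 3 * x.numel() * x.element_size() / t * 1e-6
        return gbps(ms), gbps(max_ms), gbps(min_ms)

    benchmark.run(print_data=True, show_plots=True)
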