[STYLE] run autopep8 and isort (#421)
Run:

```
isort ./python
autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py')
```

with an `.isort.cfg`, and then clean up a few warts. This PR should be a no-op: the idea is that these are all boring whitespace changes, and any config-file changes will land in a separate change to make review easier.
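The `.isort.cfg` itself is deferred to that separate change and is not shown here; purely as a sketch of what such a file can look like (every value below is an assumption, not the project's actual configuration):

```
[settings]
# Hypothetical values -- the real config lands in a follow-up change.
line_length = 88
multi_line_output = 3
known_first_party = triton
```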
parent 120cda015e
commit 8bf551ae7a
@@ -16,6 +16,8 @@ You will learn about:
# Custom GPU kernels for elementwise additions are educationally valuable but won't get you very far in practice.
# Let us consider instead the case of a simple (numerically stabilized) softmax operation:

import triton.language as tl
import triton
import torch
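For context on the tutorial code being reformatted: the next hunk's header references `naive_softmax`, i.e. a softmax stabilized by subtracting each row's maximum before exponentiating. A minimal PyTorch sketch of that idea (illustrative only; the file's actual implementation is not part of this diff):

```
import torch

def naive_softmax_sketch(x):
    # Softmax is invariant to shifting each row by a constant, so subtracting
    # the per-row max keeps exp() from overflowing without changing the result.
    z = x - x.max(dim=1, keepdim=True).values
    num = torch.exp(z)
    return num / num.sum(dim=1, keepdim=True)
```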
@@ -59,13 +61,10 @@ def naive_softmax(x):
# power-of-two number of elements, so we need to internally "pad" each row and guard the
# memory operations properly if we want to handle any possible input shapes:

import triton
import triton.language as tl


@triton.jit
def softmax_kernel(
    output_ptr, input_ptr, input_row_stride, output_row_stride, n_cols,
    BLOCK_SIZE: tl.constexpr
):
    # The rows of the softmax are independent, so we parallelize across those
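The diff truncates the kernel right after its signature. As a reminder of the pad-and-guard pattern the comments above describe, here is a sketch of how such a kernel body typically reads -- reconstructed from the tutorial's description, not copied from this diff:

```
import triton
import triton.language as tl


@triton.jit
def softmax_kernel(output_ptr, input_ptr, input_row_stride,
                   output_row_stride, n_cols, BLOCK_SIZE: tl.constexpr):
    # Each program instance handles one row; rows are independent.
    row_idx = tl.program_id(0)
    row_start_ptr = input_ptr + row_idx * input_row_stride
    # BLOCK_SIZE is a power of two >= n_cols, so the row is "padded":
    # out-of-bounds lanes are masked off and loaded as -inf, which is the
    # identity for max() and contributes exp(-inf) = 0 to the sum.
    col_offsets = tl.arange(0, BLOCK_SIZE)
    mask = col_offsets < n_cols
    row = tl.load(row_start_ptr + col_offsets, mask=mask, other=-float('inf'))
    row_minus_max = row - tl.max(row, axis=0)
    numerator = tl.exp(row_minus_max)
    denominator = tl.sum(numerator, axis=0)
    softmax_output = numerator / denominator
    out_row_start_ptr = output_ptr + row_idx * output_row_stride
    tl.store(out_row_start_ptr + col_offsets, softmax_output, mask=mask)
```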
@@ -136,7 +135,7 @@ y_triton = softmax(x)
y_torch = torch.softmax(x, axis=1)
print(torch.allclose(y_triton, y_torch))

#%%
# %%
# As expected, the results are identical.

# %%
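Incidentally, the `#%%` -> `# %%` pair above is autopep8's E265 fix ("block comment should start with '# '") applied to the gallery cell markers. On the correctness check itself, `torch.allclose` defaults to `rtol=1e-05, atol=1e-08`; a small self-contained variant with explicit tolerances (the tensors and tolerance values below are illustrative stand-ins, useful if lower-precision dtypes are ever used):

```
import torch

# Stand-ins for the y_triton / y_torch tensors computed above.
a = torch.randn(4, 8)
b = a.clone()
# Explicit, looser tolerances (hypothetical values) for low-precision inputs.
print(torch.allclose(a, b, rtol=1e-3, atol=1e-3))
```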
@@ -187,5 +186,5 @@ benchmark.run(show_plots=True, print_data=True)
# In the above plot, we can see that:
#
# - Triton is 4x faster than the Torch JIT. This confirms our suspicions that the Torch JIT does not do any fusion here.
# - Triton is noticeably faster than :code:`torch.softmax` -- in addition to being **easier to read, understand and maintain**.
# Note however that the PyTorch `softmax` operation is more general and will work on tensors of any shape.
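For readers landing on this hunk without the file open: `benchmark.run(show_plots=True, print_data=True)` comes from Triton's `triton.testing` helpers. A sketch of how such a harness is typically wired up (parameter values and the exact `do_bench` return convention vary across Triton versions; treat the specifics as assumptions):

```
import torch
import triton
import triton.testing


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=['N'],                            # vary the number of columns
        x_vals=[128 * i for i in range(2, 50)],
        line_arg='provider',
        line_vals=['triton', 'torch-native'],
        line_names=['Triton', 'Torch (native)'],
        ylabel='GB/s',
        plot_name='softmax-performance',
        args={'M': 4096},                         # fixed number of rows
    )
)
def benchmark(M, N, provider):
    x = torch.randn(M, N, device='cuda', dtype=torch.float32)
    if provider == 'torch-native':
        ms = triton.testing.do_bench(lambda: torch.softmax(x, axis=-1))
    else:
        ms = triton.testing.do_bench(lambda: softmax(x))  # tutorial's wrapper
    # One read plus one write of the tensor per softmax -> bytes moved.
    return 2 * x.nelement() * x.element_size() * 1e-9 / (ms * 1e-3)


benchmark.run(show_plots=True, print_data=True)
```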