[STYLE] run autopep8 and isort (#421)
Run:

```
isort ./python
autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py')
```

with an `.isort.cfg`, and then clean up a few warts. This PR should be a no-op: the idea is that these are all boring whitespace changes, and any config-file changes will land in a separate change to make review easier.
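The `.isort.cfg` itself is deferred to that separate change and is not shown here; purely as a sketch of what such a file can look like (every value below is an assumption, not the project's actual configuration):

```
[settings]
# Hypothetical values -- the real config lands in a follow-up change.
line_length = 88
multi_line_output = 3
known_first_party = triton
```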
parent 120cda015e
commit 8bf551ae7a
@@ -16,6 +16,8 @@ You will learn about:
# Custom GPU kernels for elementwise additions are educationally valuable but won't get you very far in practice.
# Let us consider instead the case of a simple (numerically stabilized) softmax operation:

import triton.language as tl
import triton
import torch
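For context on the tutorial code being reformatted: the next hunk's header references `naive_softmax`, i.e. a softmax stabilized by subtracting each row's maximum before exponentiating. A minimal PyTorch sketch of that idea (illustrative only; the file's actual implementation is not part of this diff):

```
import torch

def naive_softmax_sketch(x):
    # Softmax is invariant to shifting each row by a constant, so subtracting
    # the per-row max keeps exp() from overflowing without changing the result.
    z = x - x.max(dim=1, keepdim=True).values
    num = torch.exp(z)
    return num / num.sum(dim=1, keepdim=True)
```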
@@ -59,13 +61,10 @@ def naive_softmax(x):
# power-of-two number of elements, so we need to internally "pad" each row and guard the
# memory operations properly if we want to handle any possible input shapes:

import triton
import triton.language as tl


@triton.jit
def softmax_kernel(
    output_ptr, input_ptr, input_row_stride, output_row_stride, n_cols,
    BLOCK_SIZE: tl.constexpr
):
    # The rows of the softmax are independent, so we parallelize across those
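The diff truncates the kernel right after its signature. As a reminder of the pad-and-guard pattern the comments above describe, here is a sketch of how such a kernel body typically reads -- reconstructed from the tutorial's description, not copied from this diff:

```
import triton
import triton.language as tl


@triton.jit
def softmax_kernel(output_ptr, input_ptr, input_row_stride,
                   output_row_stride, n_cols, BLOCK_SIZE: tl.constexpr):
    # Each program instance handles one row; rows are independent.
    row_idx = tl.program_id(0)
    row_start_ptr = input_ptr + row_idx * input_row_stride
    # BLOCK_SIZE is a power of two >= n_cols, so the row is "padded":
    # out-of-bounds lanes are masked off and loaded as -inf, which is the
    # identity for max() and contributes exp(-inf) = 0 to the sum.
    col_offsets = tl.arange(0, BLOCK_SIZE)
    mask = col_offsets < n_cols
    row = tl.load(row_start_ptr + col_offsets, mask=mask, other=-float('inf'))
    row_minus_max = row - tl.max(row, axis=0)
    numerator = tl.exp(row_minus_max)
    denominator = tl.sum(numerator, axis=0)
    softmax_output = numerator / denominator
    out_row_start_ptr = output_ptr + row_idx * output_row_stride
    tl.store(out_row_start_ptr + col_offsets, softmax_output, mask=mask)
```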
@@ -136,7 +135,7 @@ y_triton = softmax(x)
y_torch = torch.softmax(x, axis=1)
print(torch.allclose(y_triton, y_torch))

#%%
# %%
# As expected, the results are identical.

# %%
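Incidentally, the `#%%` -> `# %%` pair above is autopep8's E265 fix ("block comment should start with '# '") applied to the gallery cell markers. On the correctness check itself, `torch.allclose` defaults to `rtol=1e-05, atol=1e-08`; a small self-contained variant with explicit tolerances (the tensors and tolerance values below are illustrative stand-ins, useful if lower-precision dtypes are ever used):

```
import torch

# Stand-ins for the y_triton / y_torch tensors computed above.
a = torch.randn(4, 8)
b = a.clone()
# Explicit, looser tolerances (hypothetical values) for low-precision inputs.
print(torch.allclose(a, b, rtol=1e-3, atol=1e-3))
```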
@@ -187,5 +186,5 @@ benchmark.run(show_plots=True, print_data=True)
# In the above plot, we can see that:
#
# - Triton is 4x faster than the Torch JIT. This confirms our suspicions that the Torch JIT does not do any fusion here.
# - Triton is noticeably faster than :code:`torch.softmax` -- in addition to being **easier to read, understand and maintain**.
# Note however that the PyTorch `softmax` operation is more general and will work on tensors of any shape.
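For readers landing on this hunk without the file open: `benchmark.run(show_plots=True, print_data=True)` comes from Triton's `triton.testing` helpers. A sketch of how such a harness is typically wired up (parameter values and the exact `do_bench` return convention vary across Triton versions; treat the specifics as assumptions):

```
import torch
import triton
import triton.testing


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=['N'],                            # vary the number of columns
        x_vals=[128 * i for i in range(2, 50)],
        line_arg='provider',
        line_vals=['triton', 'torch-native'],
        line_names=['Triton', 'Torch (native)'],
        ylabel='GB/s',
        plot_name='softmax-performance',
        args={'M': 4096},                         # fixed number of rows
    )
)
def benchmark(M, N, provider):
    x = torch.randn(M, N, device='cuda', dtype=torch.float32)
    if provider == 'torch-native':
        ms = triton.testing.do_bench(lambda: torch.softmax(x, axis=-1))
    else:
        ms = triton.testing.do_bench(lambda: softmax(x))  # tutorial's wrapper
    # One read plus one write of the tensor per softmax -> bytes moved.
    return 2 * x.nelement() * x.element_size() * 1e-9 / (ms * 1e-3)


benchmark.run(show_plots=True, print_data=True)
```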