[GH-PAGES] Updated website

Philippe Tillet
2022-07-23 00:49:35 +00:00
parent 10f1d77697
commit 5f9c7bc693
165 changed files with 273 additions and 273 deletions


@@ -262,8 +262,8 @@ Final Result
        b_ptrs += BLOCK_SIZE_K * stride_bk
    # you can fuse arbitrary activation functions here
    # while the accumulator is still in FP32!
-    if ACTIVATION:
-        accumulator = ACTIVATION(accumulator)
+    if ACTIVATION == "leaky_relu":
+        accumulator = leaky_relu(accumulator)
    c = accumulator.to(tl.float16)
    # -----------------------------------------------------------
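The updated hunk dispatches on the string constant "leaky_relu" and applies a leaky_relu helper to the accumulator while it is still in FP32. That helper is defined elsewhere in the tutorial and is not part of this hunk; as a rough sketch, a Triton device function of this kind could look like the following (the 0.01 negative slope is an assumption, not taken from this diff):

    import triton
    import triton.language as tl

    @triton.jit
    def leaky_relu(x):
        # Element-wise LeakyReLU, fused while the accumulator is still in FP32.
        # The 0.01 negative slope is an assumed default, not taken from this diff.
        return tl.where(x >= 0, x, 0.01 * x)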
@@ -300,7 +300,7 @@ and (1) checks any shape constraint; (2) allocates the output; (3) launches the
-def matmul(a, b, activation=None):
+def matmul(a, b, activation=""):
    # checks constraints
    assert a.shape[1] == b.shape[0], "incompatible dimensions"
    assert a.is_contiguous(), "matrix A must be contiguous"
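This hunk shows only the changed signature and the first shape checks of the host-side matmul wrapper. For context, a hedged sketch of the full wrapper described by the hunk header, i.e. (1) check shape constraints, (2) allocate the output, (3) launch the kernel, is given below; the kernel name matmul_kernel, the grid computation, and the launch arguments are illustrative assumptions rather than the tutorial's exact code:

    import torch
    import triton

    def matmul(a, b, activation=""):
        # (1) check shape constraints
        assert a.shape[1] == b.shape[0], "incompatible dimensions"
        assert a.is_contiguous(), "matrix A must be contiguous"
        M, K = a.shape
        K, N = b.shape
        # (2) allocate the output in FP16, matching the kernel's store type
        c = torch.empty((M, N), device=a.device, dtype=torch.float16)
        # (3) launch the kernel on a 1D grid covering all (M, N) tiles;
        #     block sizes come from the kernel's autotuning configs
        grid = lambda META: (
            triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']),
        )
        matmul_kernel[grid](
            a, b, c,
            M, N, K,
            a.stride(0), a.stride(1),
            b.stride(0), b.stride(1),
            c.stride(0), c.stride(1),
            ACTIVATION=activation,  # forwarded as a constexpr string, e.g. "leaky_relu"
        )
        return c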
@@ -436,7 +436,7 @@ We can now compare the performance of our kernel against that of cuBLAS. Here we
        )
    if provider == 'triton + relu':
        ms, min_ms, max_ms = triton.testing.do_bench(
-            lambda: matmul(a, b, activation=leaky_relu)
+            lambda: matmul(a, b, activation="leaky_relu")
        )
    perf = lambda ms: 2 * M * N * K * 1e-12 / (ms * 1e-3)
    return perf(ms), perf(max_ms), perf(min_ms)
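The perf lambda converts the measured latency to TFLOP/s: a GEMM of shape M x N x K performs roughly 2 * M * N * K floating-point operations (one multiply and one add per accumulated product), 1e-12 rescales FLOPs to TFLOPs, and ms * 1e-3 converts milliseconds to seconds. A small self-contained check of that arithmetic, using an arbitrary assumed latency of 1.5 ms:

    M = N = K = 4096
    ms = 1.5  # assumed example latency in milliseconds, not a measured value
    flops = 2 * M * N * K                  # one multiply + one add per accumulated product
    tflops = flops * 1e-12 / (ms * 1e-3)   # TFLOPs divided by seconds
    print(f"{tflops:.1f} TFLOP/s")         # about 91.6 TFLOP/s for this example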
@@ -459,37 +459,37 @@ We can now compare the performance of our kernel against that of cuBLAS. Here we
matmul-performance:
M cuBLAS ... Triton Triton (+ LeakyReLU)
- 0 256.0 2.730667 ... 2.978909 2.978909
- 1 384.0 7.372800 ... 7.899428 7.899428
+ 0 256.0 2.978909 ... 2.978909 2.978909
+ 1 384.0 7.372800 ... 8.507077 8.507077
2 512.0 14.563555 ... 15.420235 15.420235
3 640.0 22.260869 ... 24.380953 24.380953
4 768.0 32.768000 ... 34.028308 34.028308
5 896.0 39.025776 ... 40.140799 39.025776
6 1024.0 49.932191 ... 53.773130 52.428801
- 7 1152.0 45.242181 ... 47.396572 47.396572
+ 7 1152.0 45.242181 ... 48.161033 47.396572
8 1280.0 51.200001 ... 57.690139 57.690139
9 1408.0 64.138541 ... 68.147202 67.305878
- 10 1536.0 80.430545 ... 80.430545 79.526831
+ 10 1536.0 80.430545 ... 81.355034 79.526831
11 1664.0 63.372618 ... 63.372618 62.492442
12 1792.0 72.983276 ... 73.460287 59.467852
- 13 1920.0 68.776119 ... 71.626943 71.257735
- 14 2048.0 73.908442 ... 78.398206 77.314362
- 15 2176.0 83.500614 ... 87.494120 85.998493
- 16 2304.0 68.446623 ... 78.064941 77.307030
- 17 2432.0 71.125224 ... 86.179335 85.653855
- 18 2560.0 77.833728 ... 82.331658 81.108913
- 19 2688.0 83.737433 ... 91.185232 89.888756
- 20 2816.0 83.233216 ... 84.441840 84.197315
- 21 2944.0 81.564701 ... 83.758038 82.373605
- 22 3072.0 82.540970 ... 89.593522 88.335577
- 23 3200.0 83.989503 ... 95.096582 89.012517
- 24 3328.0 82.464255 ... 82.939284 84.596116
- 25 3456.0 81.932484 ... 90.994998 91.200871
- 26 3584.0 87.127323 ... 99.354022 92.600816
- 27 3712.0 84.159518 ... 89.353616 83.247783
- 28 3840.0 85.136259 ... 93.484358 86.738820
- 29 3968.0 92.302520 ... 87.976885 90.926929
- 30 4096.0 91.741443 ... 90.933416 91.304576
+ 13 1920.0 68.776119 ... 71.257735 70.892307
+ 14 2048.0 73.584279 ... 78.033565 76.959706
+ 15 2176.0 83.155572 ... 87.494120 85.998493
+ 16 2304.0 68.446623 ... 78.320893 77.558029
+ 17 2432.0 71.305746 ... 86.711310 75.320281
+ 18 2560.0 77.833728 ... 82.747477 81.715711
+ 19 2688.0 83.552988 ... 90.532356 89.676257
+ 20 2816.0 83.552120 ... 84.035084 83.392363
+ 21 2944.0 81.832567 ... 83.758038 81.967162
+ 22 3072.0 82.540970 ... 89.877939 89.170242
+ 23 3200.0 84.321474 ... 96.822991 95.380032
+ 24 3328.0 83.034941 ... 85.806075 84.596116
+ 25 3456.0 82.183044 ... 91.928814 87.632137
+ 26 3584.0 87.381330 ... 92.696281 96.891584
+ 27 3712.0 84.694652 ... 87.244203 88.092894
+ 28 3840.0 85.136259 ... 88.900318 90.279183
+ 29 3968.0 88.008611 ... 92.547541 84.268854
+ 30 4096.0 93.368854 ... 87.781379 86.592080
[31 rows x 5 columns]
@@ -499,7 +499,7 @@ We can now compare the performance of our kernel against that of cuBLAS. Here we
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 6 minutes 44.471 seconds)
+ **Total running time of the script:** ( 6 minutes 21.318 seconds)
.. _sphx_glr_download_getting-started_tutorials_03-matrix-multiplication.py: