diff --git a/docs/getting-started/tutorials/grouped_vs_row_major_ordering.png b/docs/getting-started/tutorials/grouped_vs_row_major_ordering.png
new file mode 100644
index 000000000..46a356de7
Binary files /dev/null and b/docs/getting-started/tutorials/grouped_vs_row_major_ordering.png differ
diff --git a/python/tutorials/03-matrix-multiplication.py b/python/tutorials/03-matrix-multiplication.py
index 838ddc0b0..e71fae2d6 100644
--- a/python/tutorials/03-matrix-multiplication.py
+++ b/python/tutorials/03-matrix-multiplication.py
@@ -46,7 +46,7 @@ You will specifically learn about:
 #
 # The above algorithm is, actually, fairly straightforward to implement in Triton.
 # The main difficulty comes from the computation of the memory locations at which blocks
-# of :code:`A` and :code:`B` must be read in the inner loop. For that, we need 
+# of :code:`A` and :code:`B` must be read in the inner loop. For that, we need
 # multi-dimensional pointer arithmetics.
 #
 # Pointer Arithmetics
@@ -88,7 +88,7 @@ You will specifically learn about:
 # ~~~~~~~~~~~~~~~~~~~~~~~~
 #
 # As mentioned above, each program instance computes a :code:`[BLOCK_SIZE_M, BLOCK_SIZE_N]`
-# block of :code:`C`. 
+# block of :code:`C`.
 # It is important to remember that the order in which these blocks are computed does
 # matter, since it affects the L2 cache hit rate of our program. and unfortunately, a
 # a simple row-major ordering
@@ -116,7 +116,7 @@ You will specifically learn about:
 # group_size = min(grid_m - group_id * GROUP_M, GROUP_M);
 # pid_m = group_id * GROUP_M + (pid % group_size);
 # pid_n = (pid % width) // (group_size);
-
+#
 # For example, in the following matmul where each matrix is 9 blocks by 9 blocks,
 # we can see that if we compute the output in row-major ordering, we need to load 90
 # blocks into SRAM to compute the first 9 output blocks, but if we do it in grouped
@@ -310,8 +310,8 @@ a = torch.randn((512, 512), device='cuda', dtype=torch.float16)
 b = torch.randn((512, 512), device='cuda', dtype=torch.float16)
 triton_output = matmul(a, b, activation=None)
 torch_output = torch.matmul(a, b)
-print(f"{triton_output=}")
-print(f"{torch_output=}")
+print(f"triton_output={triton_output}")
+print(f"torch_output={torch_output}")
 if triton.testing.allclose(triton_output, torch_output):
     print("✅ Triton and Torch match")
 else:
diff --git a/python/tutorials/grouped_vs_row_major_ordering.png b/python/tutorials/grouped_vs_row_major_ordering.png
deleted file mode 100644
index 1a162e4f4..000000000
Binary files a/python/tutorials/grouped_vs_row_major_ordering.png and /dev/null differ
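
For context on the grouped-ordering pseudocode in the hunk at line 116: mapping the linear program id onto (pid_m, pid_n) in groups of GROUP_M block-rows is what the tutorial credits for the better L2 hit rate. Below is a rough standalone sketch of that remapping in plain Python; the `width` and `group_id` lines are assumed from the tutorial's surrounding pseudocode (only the last three lines appear in this diff's context), and `GROUP_M = 3` is chosen purely to match the 9-by-9 example in the prose.

    def grouped_pid(pid, grid_m, grid_n, GROUP_M):
        # width/group_id follow the pseudocode lines just above the hunk
        # shown in this diff; they are reproduced here on that assumption.
        width = GROUP_M * grid_n
        group_id = pid // width
        group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
        pid_m = group_id * GROUP_M + (pid % group_size)
        pid_n = (pid % width) // group_size
        return pid_m, pid_n

    # With a 9x9 grid of blocks and GROUP_M = 3, the first 9 program ids
    # cover a 3x3 tile of C: 3 block-rows of A plus 3 block-columns of B
    # is 27 + 27 = 54 block loads, versus the 90 quoted for row-major order.
    print([grouped_pid(pid, 9, 9, 3) for pid in range(9)])
    # [(0, 0), (1, 0), (2, 0), (0, 1), (1, 1), (2, 1), (0, 2), (1, 2), (2, 2)]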
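
The print-statement hunk at line 310 is the one functional change: f"{triton_output=}" uses the self-documenting `=` specifier inside f-strings, which only exists on Python 3.8+, so writing the label out by hand presumably keeps the tutorial runnable on 3.6/3.7. A minimal illustration with a hypothetical variable `x`:

    x = 42
    # Python 3.8+ only: '=' echoes the expression text along with its value.
    print(f"{x=}")     # prints: x=42
    # Portable spelling with the label written out; works on Python 3.6+.
    print(f"x={x}")    # prints: x=42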