From 3ecf834a692b8b3f0d7cfaef91845a3e870cd3a7 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Thu, 4 Mar 2021 02:06:57 -0500
Subject: [PATCH] [PYTHON] Deleted 01-vector-add.py: it is an unnecessary
 duplicate of 01-vector-add.ipynb

---
 python/tutorials/01-vector-add.py | 76 ------------------------------
 1 file changed, 76 deletions(-)
 delete mode 100644 python/tutorials/01-vector-add.py

diff --git a/python/tutorials/01-vector-add.py b/python/tutorials/01-vector-add.py
deleted file mode 100644
index 9163e4efd..000000000
--- a/python/tutorials/01-vector-add.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import torch
-import triton
-
-# source code for the Triton compute kernel
-# here we just copy-paste the above code without the extensive comments.
-# you may prefer to store it in a .c file and load it from there instead.
-_src = """
-__global__ void add(float* z, float* x, float* y, int N){
-    // program id
-    int pid = get_program_id(0);
-    // create arrays of pointers
-    int offset[BLOCK] = pid * BLOCK + 0 ... BLOCK;
-    float* pz[BLOCK] = z + offset;
-    float* px[BLOCK] = x + offset;
-    float* py[BLOCK] = y + offset;
-    // bounds checking
-    bool check[BLOCK] = offset < N;
-    // write-back
-    *?(check)pz = *?(check)px + *?(check)py;
-}
-    """
-# This function returns a callable `triton.kernel` object
-# created from the above source code.
-# For portability, we maintain a cache of kernels for different `torch.device`s.
-# We compile the kernel with -DBLOCK=1024.
-_kernels = dict()
-
-def make_add_kernel(device):
-    if device not in _kernels:
-        defines = {'BLOCK': 1024}
-        autotune_vals = [({'BLOCK': '1024'}, 4), ({'BLOCK': '2048'}, 4)]
-        autotune_key = ["N"]
-        _kernels[device] = triton.kernel(_src, device=device, defines=defines, autotune_vals=autotune_vals,
-                                         autotune_key=autotune_key)
-    return _kernels[device]
-
-# This is a standard torch custom autograd Function.
-# The only difference is that we can now use the above kernel
-# in the `forward` and `backward` functions.
-class _add(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, x, y):
-        # constraints of the op
-        assert x.dtype == torch.float32
-        # *allocate output*
-        z = torch.empty_like(x)
-        # *create launch grid*:
-        # this is a function which takes compilation parameters `opt`
-        # as input and returns a tuple of ints (i.e., the launch grid) for the kernel.
-        # triton.cdiv is a shortcut for ceil division:
-        # triton.cdiv(a, b) = (a + b - 1) // b
-        grid = lambda opt: (triton.cdiv(z.shape[0], opt.BLOCK), )
-        # *launch kernel*:
-        # pointers to the data of torch tensors can be retrieved with
-        # the `.data_ptr()` method
-        kernel = make_add_kernel(z.device)
-        kernel(z.data_ptr(), x.data_ptr(), y.data_ptr(), z.shape[0], grid=grid)
-        return z
-
-# Just like with standard PyTorch ops,
-# we use the `.apply` method to create a
-# callable object for our function.
-add = _add.apply
-
-torch.manual_seed(0)
-x = torch.rand(32, device='cuda')
-y = torch.rand(32, device='cuda')
-za = x + y
-zb = add(x, y)
-print(za)
-print(zb)
-print(f'The maximum difference between torch and triton is ' f'{torch.max(torch.abs(za - zb))}')
-
-th_ms = triton.testing.do_bench(lambda: x + y)
-tr_ms = triton.testing.do_bench(lambda: add(x, y))
-print(th_ms, tr_ms)
\ No newline at end of file
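
For readers who only see this deletion patch (the surviving 01-vector-add.ipynb has the full
walkthrough), the computation performed by the removed Triton-C kernel can be sketched in plain
PyTorch. The explicit loop over `pid` and the `add_reference` helper below are illustrative
stand-ins for Triton's launch grid; BLOCK plays the role of the -DBLOCK=1024 compile-time define.
This is a minimal sketch of the semantics, not the tutorial's implementation.

    import torch

    BLOCK = 1024  # stand-in for the -DBLOCK=1024 compile-time define

    def add_reference(x, y):
        # Emulate the deleted kernel one "program" (block of BLOCK elements) at a time.
        N = x.shape[0]
        z = torch.empty_like(x)
        num_programs = (N + BLOCK - 1) // BLOCK        # triton.cdiv(N, BLOCK)
        for pid in range(num_programs):                # plays the role of get_program_id(0)
            offset = pid * BLOCK + torch.arange(BLOCK) # pid * BLOCK + 0 ... BLOCK
            idx = offset[offset < N]                   # bounds check: offset < N
            z[idx] = x[idx] + y[idx]                   # masked write-back: *?(check)pz = ...
        return z

    x = torch.rand(32)
    y = torch.rand(32)
    assert torch.allclose(add_reference(x, y), x + y)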
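
The grid lambda in the deleted file sizes the launch grid with ceil division, quoted in its
comments as triton.cdiv(a, b) = (a + b - 1) // b. A quick worked example of that formula:

    def cdiv(a, b):
        # ceil division, per the comment in the deleted tutorial:
        # triton.cdiv(a, b) = (a + b - 1) // b
        return (a + b - 1) // b

    for n in (32, 1024, 1025, 3000):
        print(n, cdiv(n, 1024))  # -> 1, 1, 2, 3

So for the script's N = 32 and BLOCK = 1024, the grid is (1,): a single program covers the whole
vector, and the bounds check masks off the 992 unused lanes.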
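
The script's last lines time both implementations with triton.testing.do_bench. If that helper is
unavailable, a rough CUDA-event-based equivalent can be sketched as below; the `bench_ms` name and
the warmup/rep counts are arbitrary choices for illustration, not do_bench's internals.

    import torch

    def bench_ms(fn, warmup=10, rep=100):
        # Rough CUDA-event timing, similar in spirit to triton.testing.do_bench.
        for _ in range(warmup):
            fn()                              # warm up caches and compilation
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        for _ in range(rep):
            fn()
        end.record()
        torch.cuda.synchronize()              # wait for all queued kernels to finish
        return start.elapsed_time(end) / rep  # average milliseconds per call

For example, `bench_ms(lambda: x + y)` mirrors the `th_ms` measurement at the end of the deleted
script.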