[RUNTIME] Lower-level interface for executing functions

commit acff1b5e05
parent f4f216b88a
Author: Philippe Tillet
Date: 2020-08-11 20:10:39 -04:00
Committed by: Philippe Tillet
25 changed files with 219 additions and 916 deletions
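Every hunk shown below makes the same edit across the tutorial kernels: the scalar shape arguments (M, N, K) are split onto separate lines and annotated with the new __retune qualifier, which appears to mark values whose change should trigger a fresh autotuning pass rather than reuse of a configuration tuned for a different shape. A minimal sketch of that idea in plain Python; the cache, pick_config, and launch names are hypothetical, not Triton's API:

# Illustrative sketch only, not Triton's runtime. Parameters marked
# __retune act like a key into a tuning cache: a new (M, N, K) tunes
# fresh tile sizes, while a repeated shape reuses the cached config.
tuning_cache = {}

def pick_config(M, N, K):
    # Stand-in for a real tuning pass: derive tile sizes from the shape.
    return {"TM": min(128, M), "TN": min(128, N)}

def launch(M, N, K):
    key = (M, N, K)                       # values of the __retune parameters
    if key not in tuning_cache:
        tuning_cache[key] = pick_config(M, N, K)
    return tuning_cache[key]

print(launch(1024, 1024, 64))             # first call: tunes for this shape
print(launch(1024, 1024, 64))             # second call: cache hit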

@@ -8,7 +8,9 @@ class _conv(torch.autograd.Function):
            TYPE *C __noalias __aligned(16),
            float alpha,
            // equivalent matmul
-           int M, int N, int K,
+           int M __retune,
+           int N __retune,
+           int K __retune,
            // convolution properties
            int pad_h, int pad_w, int stride_h, int stride_w,
            // pointer increment
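The other qualifiers in this signature are compiler hints rather than semantics: __noalias promises the tensors do not overlap, __aligned(16) promises a 16-byte-aligned base pointer, and __multipleof(8), used on the stride arguments in the files below, promises divisibility by 8 so the compiler can emit vectorized memory accesses. These preconditions are easy to sanity-check from the PyTorch side; a small check, assuming a CUDA build of PyTorch:

import torch

a = torch.rand((512, 512), device="cuda")
# CUDA allocations are at least 256-byte aligned, so __aligned(16) holds:
assert a.data_ptr() % 16 == 0
# A leading dimension of 512 satisfies __multipleof(8):
assert a.stride(0) % 8 == 0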
@@ -197,4 +199,4 @@ c = conv(a, b, pad, stride, time)
 print((cc - c).abs().max() / max(cc.max(), c.max()))
 print(time[0], 2*Z*H*W*CI*CO*R*S/(time[0]*1e-9)*1e-12)
 #zc = torch.matmul(a,b)
-#zc_ = dot(a,b)
\ No newline at end of file
+#zc_ = dot(a,b)
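The benchmark print above converts a measured kernel time into a rate: a convolution with batch Z, output H x W, channels CI to CO, and filter R x S performs 2*Z*H*W*CI*CO*R*S floating-point operations (one multiply and one add per accumulation), time[0] is in nanoseconds, and the final 1e-12 scales the result to TFLOPS. The same arithmetic spelled out, with made-up sizes and timing:

# Worked example of the TFLOPS formula above; all numbers are hypothetical.
Z, H, W, CI, CO, R, S = 16, 56, 56, 64, 64, 3, 3
time_ns = 250_000                          # pretend measured kernel time

flops = 2 * Z * H * W * CI * CO * R * S    # one multiply + one add per MAC
seconds = time_ns * 1e-9
print(flops / seconds * 1e-12, "TFLOPS")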

@@ -4,7 +4,9 @@ import triton
 class _copy(torch.autograd.Function):
     src = """
 __global__ void copy(TYPE * X, TYPE * Y,
-                     int M, int N, int ldx __multipleof(8)) {
+                     int M __retune,
+                     int N __retune,
+                     int ldx __multipleof(8)) {
   // extract program ID
   int pidm = get_program_id(0); //(1)
   int pidn = get_program_id(1); //(2)
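get_program_id(0) and get_program_id(1) give each kernel instance its coordinates in the 2D launch grid, and each instance handles one TM x TN tile of the M x N matrix. A plain-Python rendering of that mapping; TM, TN, and the ceil-division grid are the usual tiling convention, not code from this commit:

# How a 2D grid of program IDs tiles an M x N matrix.
M, N, TM, TN = 100, 70, 32, 32
grid = ((M + TM - 1) // TM, (N + TN - 1) // TN)     # ceil-div in each dim

for pidm in range(grid[0]):
    for pidn in range(grid[1]):
        rm = range(pidm * TM, min((pidm + 1) * TM, M))  # rows of this tile
        rn = range(pidn * TN, min((pidn + 1) * TN, N))  # cols of this tile
        # a real kernel would copy X[rm, rn] -> Y[rm, rn] here, with
        # out-of-range offsets masked rather than clipped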

@@ -7,7 +7,9 @@ class _dot(torch.autograd.Function):
              TYPE *B __noalias __readonly __aligned(16),
              TYPE *C __noalias __aligned(16),
              float alpha,
-             int M, int N, int K,
+             int M __retune,
+             int N __retune,
+             int K __retune,
              int lda __multipleof(8),
              int ldb __multipleof(8),
              int ldc __multipleof(8)) {
@@ -128,4 +130,4 @@ b = torch.rand((K, N)).cuda()
 #zc = torch.matmul(a,b)
 zc_ = dot(a,b)
-#print(torch.allclose(zc, zc_))
\ No newline at end of file
+#print(torch.allclose(zc, zc_))
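The commented-out check compares the Triton result against torch.matmul with torch.allclose, while the conv example earlier prints a max relative error instead. Both metrics in one runnable snippet; the double-precision matmul stands in for a kernel with a different accumulation order, since dot itself is not available here:

import torch

torch.manual_seed(0)
M, N, K = 256, 256, 1024
a = torch.rand((M, K))
b = torch.rand((K, N))

zc = torch.matmul(a, b)                     # fp32 reference
zc_ = (a.double() @ b.double()).float()     # stand-in for dot(a, b)
print(torch.allclose(zc, zc_))              # strict elementwise tolerance
rel = (zc - zc_).abs().max() / zc.abs().max()
print(rel.item())                           # max relative error, as in conv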

@@ -4,7 +4,9 @@ import triton
 class _transpose(torch.autograd.Function):
     src = """
 __global__ void transpose(TYPE * X, TYPE * Y,
-               int M, int N, int ldx __multipleof(8), int ldy __multipleof(8)) {
+               int M __retune,
+               int N __retune,
+               int ldx __multipleof(8), int ldy __multipleof(8)) {
   // extract program ID
   int pidm = get_program_id(0); //(1)
   int pidn = get_program_id(1); //(2)
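With a row-major layout, element (m, n) of X sits at offset m*ldx + n, and its transpose lands at offset n*ldy + m in Y; the kernel applies exactly this index swap, one TM x TN tile per program ID. The same math on flat Python lists, with illustrative sizes:

# Strided transpose on flat buffers: Y[n*ldy + m] = X[m*ldx + n],
# assuming row-major storage for both matrices.
M, N = 3, 5
ldx, ldy = N, M          # tight leading dimensions; padded ones also work
X = list(range(M * N))   # M x N input
Y = [0] * (N * M)        # N x M output

for m in range(M):
    for n in range(N):
        Y[n * ldy + m] = X[m * ldx + n]

print(X)
print(Y)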