[Backend] Vectorize Load/Store Ops (#86)

This PR does the following things:

- Refactor the Load and Store op codegen: both are rewritten to follow the same
logic and share most of their code
- Support vectorized load/store
This commit is contained in:
Yan Chunwei
2022-09-07 03:28:09 +08:00
committed by GitHub
parent 35e346bcff
commit a9464f4993
10 changed files with 433 additions and 295 deletions

View File

@@ -29,7 +29,6 @@ def test_empty_kernel_cubin_compile():
def test_empty_kernel_launch():
device = torch.cuda.current_device()
binary = runtime.build_kernel(empty_kernel, "*fp32,i32,i32",
device=device,
constants={"BLOCK": 256},
num_warps=4,
num_stages=3)
@@ -38,11 +37,9 @@ def test_empty_kernel_launch():
)
A = torch.zeros([1024], device="cuda")
runtime.launch_kernel(fn=empty_kernel,
binary=binary,
runtime.launch_kernel(kernel=binary,
grid=grid,
num_warps=4,
num_stages=3,
device=device,
X=A,
stride_xm=256,
BLOCK=tl.constexpr(256))