- Unifying several interfaces with different types into a single one, e.g. `fsub_ru` and `dsub_ru` -> `sub_ru`; - Minor bug fix: `fast_pow` was incorrectly classified under the `pow` interface, whose arguments are the same as those of `powf`; - Explicit interfaces for casting functions, e.g. decoupling `ll2float_ru` into `ll2float_ru` and `ull2float_ru`; - Removing interfaces that are not in NVIDIA's official documentation, e.g. `fmaf_ieee_rn`, which is easily confused with `fmaf_rn`. Co-authored-by: Keren Zhou <kerenzhou@openai.com>
34 lines
1.1 KiB
Python
34 lines
1.1 KiB
Python
|
|
import triton
|
|
import triton.language as tl
|
|
|
|
|
|
# Kernel that pushes one block of data through four math entry points:
# the builtin `tl.sin` plus three `tl.libdevice` functions.
#
# Arguments:
#   x1_ptr..x4_ptr: base pointers; each is read and then overwritten in place
#       with the result of the operation applied to its own data.
#   n: element count; lanes whose offset is >= n are masked out of every
#       load and store.
#   BLOCK_SIZE: compile-time number of lanes handled by the kernel.
@triton.jit
def math_kernel(x1_ptr, x2_ptr, x3_ptr, x4_ptr, n, BLOCK_SIZE: tl.constexpr):
    lanes = tl.arange(0, BLOCK_SIZE)
    # One shared bounds mask guards every memory access below.
    in_bounds = lanes < n

    # All four inputs are read before anything is written back.
    a = tl.load(x1_ptr + lanes, mask=in_bounds)
    b = tl.load(x2_ptr + lanes, mask=in_bounds)
    c = tl.load(x3_ptr + lanes, mask=in_bounds)
    d = tl.load(x4_ptr + lanes, mask=in_bounds)

    # Builtin sine vs. the libdevice sine.
    sin_builtin = tl.sin(a)
    sin_libdevice = tl.libdevice.sin(b)
    # NOTE(review): the `_rn` / `_rd` suffixes presumably select the
    # round-to-nearest / round-down libdevice variants -- confirm against the
    # libdevice guide.
    quotient = tl.libdevice.div_rn(c, c)
    fused = tl.libdevice.fma_rd(d, d, d)

    # Write each result back over the buffer it was computed from.
    tl.store(x1_ptr + lanes, sin_builtin, mask=in_bounds)
    tl.store(x2_ptr + lanes, sin_libdevice, mask=in_bounds)
    tl.store(x3_ptr + lanes, quotient, mask=in_bounds)
    tl.store(x4_ptr + lanes, fused, mask=in_bounds)
|
|
|
|
|
|
def test_empty_kernel_cubin_compile():
    """Compile ``math_kernel`` to the "ttgir" stage and expect a truthy result.

    Only successful compilation is checked; the kernel is never launched.
    """
    # Four fp32 pointers followed by the i32 element count `n`.
    signature = "*fp32,*fp32,*fp32,*fp32,i32"
    constexprs = {"BLOCK_SIZE": 256}

    compiled = triton.compiler._compile(
        math_kernel,
        signature,
        device=0,
        constants=constexprs,
        output="ttgir",  # "cubin"
    )
    assert compiled
    # TODO: Check if the values are correct.
    # TODO: Cover all the math operators
|