Files
triton/python/tests/test_math_ops.py
Chenggang Zhao c9d84237e8 [Triton-MLIR][Frontend] Interface fixes for libdevice (#829)
- Unifying several interfaces with different types to a single one, e.g.
`fsub_ru` and `dsub_ru` -> `sub_ru`;
- Minor bug fix: `fast_pow`, whose arguments are the same as those of
`powf`, was incorrectly classified into the `pow` interface;
- Explicit interfaces for casting functions, e.g. splitting
`ll2float_ru` into `ll2float_ru` and `ull2float_ru`;
- Removing interfaces that are not in NVIDIA's official documents, e.g.
`fmaf_ieee_rn`, which is confusing together with `fmaf_rn`.

Co-authored-by: Keren Zhou <kerenzhou@openai.com>
2022-11-01 10:51:32 -07:00

34 lines
1.1 KiB
Python

import triton
import triton.language as tl
@triton.jit
def math_kernel(x1_ptr, x2_ptr, x3_ptr, x4_ptr, n, BLOCK_SIZE: tl.constexpr):
    # Applies one math operation to each of four buffers and writes the
    # result back over the input it was computed from:
    #   x1 <- tl.sin(x1)
    #   x2 <- tl.libdevice.sin(x2)
    #   x3 <- tl.libdevice.div_rn(x3, x3)
    #   x4 <- tl.libdevice.fma_rd(x4, x4, x4)
    # Lane indices come straight from tl.arange(0, BLOCK_SIZE); only lanes
    # whose index is below `n` are loaded and stored.
    lane = tl.arange(0, BLOCK_SIZE)
    in_bounds = lane < n

    # Per-lane addresses, shared by the load and the store of each buffer.
    addr1 = x1_ptr + lane
    addr2 = x2_ptr + lane
    addr3 = x3_ptr + lane
    addr4 = x4_ptr + lane

    # All loads are issued before any store, as in the original ordering.
    a = tl.load(addr1, mask=in_bounds)
    b = tl.load(addr2, mask=in_bounds)
    c = tl.load(addr3, mask=in_bounds)
    d = tl.load(addr4, mask=in_bounds)

    sin_builtin = tl.sin(a)
    sin_libdevice = tl.libdevice.sin(b)
    self_quotient = tl.libdevice.div_rn(c, c)
    self_fma = tl.libdevice.fma_rd(d, d, d)

    tl.store(addr1, sin_builtin, mask=in_bounds)
    tl.store(addr2, sin_libdevice, mask=in_bounds)
    tl.store(addr3, self_quotient, mask=in_bounds)
    tl.store(addr4, self_fma, mask=in_bounds)
def test_empty_kernel_cubin_compile():
    """Compile ``math_kernel`` to ``ttgir`` and check the result is truthy."""
    # Four fp32 pointers followed by an i32, matching the kernel's
    # x1_ptr..x4_ptr and n parameters; BLOCK_SIZE is passed as a constant.
    signature = "*fp32,*fp32,*fp32,*fp32,i32"
    constexprs = {"BLOCK_SIZE": 256}
    target = "ttgir"  # "cubin"

    compiled = triton.compiler._compile(
        math_kernel,
        signature,
        device=0,
        constants=constexprs,
        output=target,
    )
    assert compiled
    # TODO: Check if the values are correct.
    # TODO: Cover all the math operators