Merge branch 'master' into IFU_11_1_2022
@@ -33,6 +33,15 @@ And the latest nightly release:
pip install -U --pre triton
```

# Install from source

```
git clone https://github.com/openai/triton.git;
cd triton/python;
pip install cmake; # build time dependency
pip install -e .
```

# Changelog

Version 1.1 is out! New features include:
@@ -1593,9 +1593,9 @@ def test_num_warps_pow2():
                         [('int32', 'libdevice.ffs', ''),
                          ('float32', 'libdevice.pow', '/usr/local/cuda/nvvm/libdevice/libdevice.10.bc'),
                          ('float64', 'libdevice.norm4d', '')])
def test_libdevice(dtype_str, expr, lib_path):
def test_libdevice_tensor(dtype_str, expr, lib_path):
    if torch.version.hip is not None:
        pytest.skip(f"test_libdevice currently has segfaults on ROCM")
        pytest.skip(f"test_libdevice_tensor currently has segfaults on ROCM")
    @triton.jit
    def kernel(X, Y, BLOCK: tl.constexpr):
        x = tl.load(X + tl.arange(0, BLOCK))
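The renamed `test_libdevice_tensor` exercises extern libdevice calls on whole blocks. For reference, a minimal standalone sketch of that usage pattern is shown below; it assumes a CUDA device and the Triton version on this branch, and the kernel name and sizes are illustrative rather than taken from the test suite.

```
import torch
import triton
import triton.language as tl


# Minimal sketch of a block-wise libdevice call, mirroring test_libdevice_tensor.
@triton.jit
def pow_kernel(X, Y, BLOCK: tl.constexpr):
    x = tl.load(X + tl.arange(0, BLOCK))   # load a whole block
    y = tl.libdevice.pow(x, x)             # extern libdevice call on the block
    tl.store(Y + tl.arange(0, BLOCK), y)


x = torch.rand(128, device='cuda')
y = torch.empty_like(x)
# An empty 'libdevice' path matches what the tests pass when no explicit
# bitcode path is provided.
pow_kernel[(1,)](x, y, BLOCK=128, extern_libs={'libdevice': ''})
```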
@@ -1630,3 +1630,32 @@ def test_libdevice(dtype_str, expr, lib_path):
        np.testing.assert_equal(y_ref, to_numpy(y_tri))
    else:
        np.testing.assert_allclose(y_ref, to_numpy(y_tri), rtol=0.01)


@pytest.mark.parametrize("dtype_str, expr, lib_path",
                         [('float32', 'libdevice.pow', '')])
def test_libdevice_scalar(dtype_str, expr, lib_path):

    @triton.jit
    def kernel(X, Y, BLOCK: tl.constexpr):
        x = X
        y = GENERATE_TEST_HERE
        tl.store(Y + tl.arange(0, BLOCK), y)

    shape = (128, )
    rs = RandomState(17)
    # limit the range of integers so that the sum does not overflow
    x = numpy_random((1,), dtype_str=dtype_str, rs=rs)
    y_ref = np.zeros(shape, dtype=x.dtype)

    # numpy does not allow negative factors in power, so we use abs()
    x = np.abs(x)
    kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': 'tl.libdevice.pow(x, x)'})
    y_ref[:] = np.power(x, x)

    # triton result
    x_tri = to_triton(x)[0].item()
    y_tri = to_triton(numpy_random((shape[0],), dtype_str=dtype_str, rs=rs), device='cuda')
    kernel[(1,)](x_tri, y_tri, BLOCK=shape[0], extern_libs={'libdevice': lib_path})
    # compare
    np.testing.assert_allclose(y_ref, to_numpy(y_tri), rtol=0.01)
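The new `test_libdevice_scalar` covers the case where the libdevice call receives plain scalars rather than blocks (enabled by the `elementwise` change further down). A rough standalone sketch of that pattern, assuming a CUDA device; the kernel name and the input value are made up for illustration.

```
import torch
import triton
import triton.language as tl


# Sketch of the scalar path: X arrives as a plain Python float, so
# tl.libdevice.pow operates on two scalars and the result is broadcast
# into a block by tl.store (mirrors test_libdevice_scalar above).
@triton.jit
def pow_scalar_kernel(X, Y, BLOCK: tl.constexpr):
    x = X                                   # scalar kernel argument, no tl.load
    y = tl.libdevice.pow(x, x)              # extern libdevice call on scalars
    tl.store(Y + tl.arange(0, BLOCK), y)


y = torch.empty(128, device='cuda', dtype=torch.float32)
pow_scalar_kernel[(1,)](3.0, y, BLOCK=128, extern_libs={'libdevice': ''})
```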
@@ -922,7 +922,10 @@ def ty_to_cpp(ty):
        "i64": "int64_t",
        "u32": "uint32_t",
        "u64": "uint64_t",
        "fp16": "float",
        "bf16": "float",
        "fp32": "float",
        "fp64": "double",
    }[ty]
@@ -952,6 +955,8 @@ def generate_launcher(identifier, constants, signature):
        'i64': 'int64_t',
        'u32': 'uint32_t',
        'u64': 'uint64_t',
        'fp16': 'float',
        'bf16': 'float',
        'fp32': 'float',
        'fp64': 'double',
    }[ty]
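Both tables widen half-precision scalar kernel arguments to a host-side C `float` (and `fp64` to `double`) when the launcher source is generated, since C has no portable scalar type for fp16/bf16. The snippet below is only a rough illustration of how such a table is consumed; the `signature` dict and the pointer handling are simplified assumptions, not the exact code in this file.

```
# Rough illustration of consuming a ty-to-C-type table when emitting a
# launcher signature. The signature below is made up for the example.
ty_to_cpp = {
    "i32": "int32_t", "i64": "int64_t",
    "u32": "uint32_t", "u64": "uint64_t",
    "fp16": "float",   # half precision has no portable C scalar type,
    "bf16": "float",   # so both fp16 and bf16 are widened to float
    "fp32": "float",
    "fp64": "double",
}

signature = {0: "*fp32", 1: "fp16", 2: "i32"}  # pointer, scalar, scalar
arg_decls = ", ".join(
    f"{'CUdeviceptr' if ty.startswith('*') else ty_to_cpp[ty]} arg{i}"
    for i, ty in signature.items()
)
print(arg_decls)  # CUdeviceptr arg0, float arg1, int32_t arg2
```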
@@ -59,28 +59,34 @@ def elementwise(lib_name: str, lib_path: str, args: list, arg_type_symbol_dict:
    :return: the return value of the function
    '''
    dispatch_args = args.copy()
    if len(args) == 1:
        dispatch_args[0] = core._to_tensor(dispatch_args[0], _builder)
        ret_shape = dispatch_args[0].shape
    elif len(args) == 2:
        dispatch_args[0] = core._to_tensor(dispatch_args[0], _builder)
        dispatch_args[1] = core._to_tensor(dispatch_args[1], _builder)
        dispatch_args[0], dispatch_args[1] = semantic.binary_op_type_checking_impl(
            dispatch_args[0], dispatch_args[1], _builder)
        ret_shape = dispatch_args[0].shape
    else:
        for i in range(len(dispatch_args)):
            dispatch_args[i] = core._to_tensor(dispatch_args[i], _builder)
        broadcast_arg = dispatch_args[0]
        # Get the broadcast shape over all the arguments
        for i in range(len(dispatch_args)):
            _, broadcast_arg = semantic.binary_op_type_checking_impl(
                dispatch_args[i], broadcast_arg, _builder)
        # Change the shape of each argument based on the broadcast shape
        for i in range(len(dispatch_args)):
            dispatch_args[i], _ = semantic.binary_op_type_checking_impl(
                dispatch_args[i], broadcast_arg, _builder)
        ret_shape = broadcast_arg.shape
    all_scalar = True
    ret_shape = None
    for dispatch_arg in dispatch_args:
        if dispatch_arg.type.is_block():
            all_scalar = False
    if not all_scalar:
        if len(args) == 1:
            dispatch_args[0] = core._to_tensor(dispatch_args[0], _builder)
            ret_shape = dispatch_args[0].shape
        elif len(args) == 2:
            dispatch_args[0] = core._to_tensor(dispatch_args[0], _builder)
            dispatch_args[1] = core._to_tensor(dispatch_args[1], _builder)
            dispatch_args[0], dispatch_args[1] = semantic.binary_op_type_checking_impl(
                dispatch_args[0], dispatch_args[1], _builder)
            ret_shape = dispatch_args[0].shape
        else:
            for i in range(len(dispatch_args)):
                dispatch_args[i] = core._to_tensor(dispatch_args[i], _builder)
            broadcast_arg = dispatch_args[0]
            # Get the broadcast shape over all the arguments
            for i in range(len(dispatch_args)):
                _, broadcast_arg = semantic.binary_op_type_checking_impl(
                    dispatch_args[i], broadcast_arg, _builder)
            # Change the shape of each argument based on the broadcast shape
            for i in range(len(dispatch_args)):
                dispatch_args[i], _ = semantic.binary_op_type_checking_impl(
                    dispatch_args[i], broadcast_arg, _builder)
            ret_shape = broadcast_arg.shape
    func = getattr(_builder, "create_extern_elementwise")
    return dispatch(func, lib_name, lib_path, dispatch_args, arg_type_symbol_dict, ret_shape, _builder)
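In the n-ary branch, the broadcast shape is computed in two passes: the first fold derives one common shape from all arguments, the second broadcasts every argument to it. The new `all_scalar` guard skips both passes when no argument is a block, leaving `ret_shape` as `None` so the extern call is emitted on scalars directly. A rough NumPy analogue of the two-pass broadcast (standing in for Triton's `semantic` helpers, not the real implementation):

```
import numpy as np


def broadcast_all(args):
    """NumPy analogue of the two-pass broadcast in `elementwise` above."""
    arrays = [np.asarray(a) for a in args]
    # Pass 1: fold every argument into one common broadcast shape.
    shape = arrays[0].shape
    for a in arrays:
        shape = np.broadcast_shapes(shape, a.shape)
    # Pass 2: broadcast each argument to that common shape.
    return [np.broadcast_to(a, shape) for a in arrays], shape


blocks, shape = broadcast_all([np.ones((4, 1)), np.arange(3), 2.0])
print(shape)  # (4, 3)
```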