diff --git a/README.md b/README.md
index ed0fc71b1..bab417daa 100644
--- a/README.md
+++ b/README.md
@@ -33,6 +33,15 @@ And the latest nightly release:
 pip install -U --pre triton
 ```
 
+# Install from source
+
+```
+git clone https://github.com/openai/triton.git;
+cd triton/python;
+pip install cmake; # build time dependency
+pip install -e .
+```
+
 # Changelog
 
 Version 1.1 is out! New features include:
diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py
index cad10c871..7e6071926 100644
--- a/python/test/unit/language/test_core.py
+++ b/python/test/unit/language/test_core.py
@@ -1593,9 +1593,9 @@ def test_num_warps_pow2():
                          [('int32', 'libdevice.ffs', ''),
                           ('float32', 'libdevice.pow', '/usr/local/cuda/nvvm/libdevice/libdevice.10.bc'),
                           ('float64', 'libdevice.norm4d', '')])
-def test_libdevice(dtype_str, expr, lib_path):
+def test_libdevice_tensor(dtype_str, expr, lib_path):
     if torch.version.hip is not None:
-        pytest.skip(f"test_libdevice currently has segfaults on ROCM")
+        pytest.skip(f"test_libdevice_tensor currently has segfaults on ROCM")
     @triton.jit
     def kernel(X, Y, BLOCK: tl.constexpr):
         x = tl.load(X + tl.arange(0, BLOCK))
@@ -1630,3 +1630,32 @@ def test_libdevice(dtype_str, expr, lib_path):
         np.testing.assert_equal(y_ref, to_numpy(y_tri))
     else:
         np.testing.assert_allclose(y_ref, to_numpy(y_tri), rtol=0.01)
+
+
+@pytest.mark.parametrize("dtype_str, expr, lib_path",
+                         [('float32', 'libdevice.pow', '')])
+def test_libdevice_scalar(dtype_str, expr, lib_path):
+
+    @triton.jit
+    def kernel(X, Y, BLOCK: tl.constexpr):
+        x = X
+        y = GENERATE_TEST_HERE
+        tl.store(Y + tl.arange(0, BLOCK), y)
+
+    shape = (128, )
+    rs = RandomState(17)
+    # generate a single random scalar input
+    x = numpy_random((1,), dtype_str=dtype_str, rs=rs)
+    y_ref = np.zeros(shape, dtype=x.dtype)
+
+    # numpy does not allow negative factors in power, so we use abs()
+    x = np.abs(x)
+    kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': 'tl.libdevice.pow(x, x)'})
+    y_ref[:] = np.power(x, x)
+
+    # triton result
+    x_tri = to_triton(x)[0].item()
+    y_tri = to_triton(numpy_random((shape[0],), dtype_str=dtype_str, rs=rs), device='cuda')
+    kernel[(1,)](x_tri, y_tri, BLOCK=shape[0], extern_libs={'libdevice': lib_path})
+    # compare
+    np.testing.assert_allclose(y_ref, to_numpy(y_tri), rtol=0.01)
diff --git a/python/triton/compiler.py b/python/triton/compiler.py
index e39eb053a..6eb331dbb 100644
--- a/python/triton/compiler.py
+++ b/python/triton/compiler.py
@@ -922,7 +922,10 @@ def ty_to_cpp(ty):
         "i64": "int64_t",
         "u32": "uint32_t",
         "u64": "uint64_t",
+        "fp16": "float",
+        "bf16": "float",
         "fp32": "float",
+        "fp64": "double",
     }[ty]
 
 
@@ -952,6 +955,8 @@ def generate_launcher(identifier, constants, signature):
            'i64': 'int64_t',
            'u32': 'uint32_t',
            'u64': 'uint64_t',
+           'fp16': 'float',
+           'bf16': 'float',
            'fp32': 'float',
            'fp64': 'double',
        }[ty]
diff --git a/python/triton/language/extern.py b/python/triton/language/extern.py
index a306a2e9a..1f3c9371c 100644
--- a/python/triton/language/extern.py
+++ b/python/triton/language/extern.py
@@ -59,28 +59,34 @@ def elementwise(lib_name: str, lib_path: str, args: list, arg_type_symbol_dict:
     :return: the return value of the function
     '''
     dispatch_args = args.copy()
-    if len(args) == 1:
-        dispatch_args[0] = core._to_tensor(dispatch_args[0], _builder)
-        ret_shape = dispatch_args[0].shape
-    elif len(args) == 2:
-        dispatch_args[0] = core._to_tensor(dispatch_args[0], _builder)
-        dispatch_args[1] = core._to_tensor(dispatch_args[1], _builder)
-        dispatch_args[0], dispatch_args[1] = semantic.binary_op_type_checking_impl(
-            dispatch_args[0], dispatch_args[1], _builder)
-        ret_shape = dispatch_args[0].shape
-    else:
-        for i in range(len(dispatch_args)):
-            dispatch_args[i] = core._to_tensor(dispatch_args[i], _builder)
-        broadcast_arg = dispatch_args[0]
-        # Get the broadcast shape over all the arguments
-        for i in range(len(dispatch_args)):
-            _, broadcast_arg = semantic.binary_op_type_checking_impl(
-                dispatch_args[i], broadcast_arg, _builder)
-        # Change the shape of each argument based on the broadcast shape
-        for i in range(len(dispatch_args)):
-            dispatch_args[i], _ = semantic.binary_op_type_checking_impl(
-                dispatch_args[i], broadcast_arg, _builder)
-        ret_shape = broadcast_arg.shape
+    all_scalar = True
+    ret_shape = None
+    for dispatch_arg in dispatch_args:
+        if dispatch_arg.type.is_block():
+            all_scalar = False
+    if not all_scalar:
+        if len(args) == 1:
+            dispatch_args[0] = core._to_tensor(dispatch_args[0], _builder)
+            ret_shape = dispatch_args[0].shape
+        elif len(args) == 2:
+            dispatch_args[0] = core._to_tensor(dispatch_args[0], _builder)
+            dispatch_args[1] = core._to_tensor(dispatch_args[1], _builder)
+            dispatch_args[0], dispatch_args[1] = semantic.binary_op_type_checking_impl(
+                dispatch_args[0], dispatch_args[1], _builder)
+            ret_shape = dispatch_args[0].shape
+        else:
+            for i in range(len(dispatch_args)):
+                dispatch_args[i] = core._to_tensor(dispatch_args[i], _builder)
+            broadcast_arg = dispatch_args[0]
+            # Get the broadcast shape over all the arguments
+            for i in range(len(dispatch_args)):
+                _, broadcast_arg = semantic.binary_op_type_checking_impl(
+                    dispatch_args[i], broadcast_arg, _builder)
+            # Change the shape of each argument based on the broadcast shape
+            for i in range(len(dispatch_args)):
+                dispatch_args[i], _ = semantic.binary_op_type_checking_impl(
+                    dispatch_args[i], broadcast_arg, _builder)
+            ret_shape = broadcast_arg.shape
     func = getattr(_builder, "create_extern_elementwise")
     return dispatch(func, lib_name, lib_path, dispatch_args, arg_type_symbol_dict, ret_shape, _builder)
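For context, the new all-scalar branch in `elementwise` is what allows a `tl.libdevice` call to take plain scalar kernel arguments. A minimal sketch of how the feature is exercised, modeled directly on the `test_libdevice_scalar` test above; the kernel name, tensor names, and launch values are illustrative, and a CUDA device with the default libdevice bitcode is assumed:

```python
import torch
import triton
import triton.language as tl


@triton.jit
def scalar_pow_kernel(X, Out, BLOCK: tl.constexpr):
    # X is a plain scalar argument, not a loaded block, so the
    # extern call below goes through the new all-scalar dispatch path.
    x = X
    y = tl.libdevice.pow(x, x)
    # The scalar result is broadcast across the stored block.
    tl.store(Out + tl.arange(0, BLOCK), y)


out = torch.empty(128, device='cuda', dtype=torch.float32)
# An empty lib_path mirrors the ('float32', 'libdevice.pow', '')
# parametrization used in test_libdevice_scalar.
scalar_pow_kernel[(1,)](2.0, out, BLOCK=128, extern_libs={'libdevice': ''})
print(out)  # every element should be 2.0 ** 2.0 == 4.0
```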