diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py
index fd7447e51..a6e343b80 100644
--- a/python/test/unit/language/test_core.py
+++ b/python/test/unit/language/test_core.py
@@ -116,7 +116,7 @@ def test_empty_kernel(dtype_x, device='cuda'):
     @triton.jit
     def kernel(X, SIZE: tl.constexpr):
         pass
-    check_type_supported(dtype_x)
+    # check_type_supported(dtype_x)  # temporarily disabled for the HIP bring-up
     x = to_triton(numpy_random(SIZE, dtype_str=dtype_x), device=device, dst_type=dtype_x)
     kernel[(1, )](x, SIZE=SIZE, num_warps=4)
diff --git a/python/triton/compiler.py b/python/triton/compiler.py
index 1332f2c76..bf1a97482 100644
--- a/python/triton/compiler.py
+++ b/python/triton/compiler.py
@@ -904,7 +904,7 @@ def _compile(fn, signature: str, device: int = -1, constants=dict(),
 
 def ty_to_cpp(ty):
     if ty[0] == '*':
-        return "CUdeviceptr"
+        return "hipDeviceptr_t"
     return {
         "i1": "int32_t",
         "i8": "int8_t",
@@ -962,17 +962,18 @@ def generate_launcher(identifier, constants, signature):
     format = "iiiiiKKOOO" + ''.join([format_of(_extracted_type(ty)) for ty in signature.values()])
 
     # generate glue code
-    src = f"""
-#include \"cuda.h\"
+    if torch.version.hip is not None:
+        src = f"""
+#define __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
 #include <Python.h>
 
-static inline void gpuAssert(CUresult code, const char *file, int line)
+static inline void gpuAssert(hipError_t code, const char *file, int line)
 {{
-   if (code != CUDA_SUCCESS)
+   if (code != hipSuccess)
    {{
       const char* prefix = "Triton Error [CUDA]: ";
-      const char* str;
-      cuGetErrorString(code, &str);
+      const char* str = hipGetErrorString(code);
       char err[1024] = {{0}};
       strcat(err, prefix);
       strcat(err, str);
@@ -982,20 +983,20 @@ static inline void gpuAssert(CUresult code, const char *file, int line)
 
 #define CUDA_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }}
 
-void _launch(int gridX, int gridY, int gridZ, int num_warps, int shared_memory, CUstream stream, CUfunction function, {arg_decls}) {{
+void _launch(int gridX, int gridY, int gridZ, int num_warps, int shared_memory, hipStream_t stream, hipFunction_t function, {arg_decls}) {{
   void *params[] = {{ {', '.join(f"&arg{i}" for i in signature.keys() if i not in constants)} }};
   if(gridX*gridY*gridZ > 0){{
-    CUDA_CHECK(cuLaunchKernel(function, gridX, gridY, gridZ, 32*num_warps, 1, 1, shared_memory, stream, params, 0));
+    // hipModuleLaunchKernel mirrors cuLaunchKernel for hipFunction_t kernels;
+    // the runtime-API hipLaunchKernel would drop params and shared_memory
+    CUDA_CHECK(hipModuleLaunchKernel(function, gridX, gridY, gridZ, 32*num_warps, 1, 1, shared_memory, stream, params, 0));
   }}
 }}
 
-static inline CUdeviceptr getPointer(PyObject *obj, int idx) {{
+static inline hipDeviceptr_t getPointer(PyObject *obj, int idx) {{
   if (PyLong_Check(obj)) {{
-    return (CUdeviceptr)PyLong_AsUnsignedLongLong(obj);
+    return (hipDeviceptr_t)PyLong_AsUnsignedLongLong(obj);
   }}
   if (obj == Py_None) {{
-    return (CUdeviceptr)0;
+    return (hipDeviceptr_t)0;
   }}
   PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr");
   if(ptr){{
@@ -1006,10 +1007,10 @@ static inline CUdeviceptr getPointer(PyObject *obj, int idx) {{
     if (!PyLong_Check(ret)) {{
       PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
     }}
-    return (CUdeviceptr)PyLong_AsUnsignedLongLong(ret);
+    return (hipDeviceptr_t)PyLong_AsUnsignedLongLong(ret);
   }}
   PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
-  return (CUdeviceptr)0;
+  return (hipDeviceptr_t)0;
 }}
@@ -1034,7 +1035,7 @@ static PyObject* launch(PyObject* self, PyObject* args) {{
     Py_DECREF(new_args);
   }}
 
-  _launch(gridX, gridY, gridZ, num_warps, shared_memory, (CUstream)_stream, (CUfunction)_function, {', '.join(f"getPointer(_arg{i},{i})" if ty[0]=="*" else f"_arg{i}"for i, ty in signature.items())});
+  _launch(gridX, gridY, gridZ, num_warps, shared_memory, (hipStream_t)_stream, (hipFunction_t)_function, {', '.join(f"getPointer(_arg{i},{i})" if ty[0]=="*" else f"_arg{i}" for i, ty in signature.items())});
 
   if (launch_exit_hook != Py_None) {{
     PyObject *new_args = NULL;
@@ -1129,12 +1130,20 @@ def libcuda_dirs():
     locs = subprocess.check_output(["whereis", "libcuda.so"]).decode().strip().split()[1:]
     return [os.path.dirname(loc) for loc in locs]
 
+@functools.lru_cache()
+def libhip_dirs():
+    # used as -L search paths below, so return directories, not .so files
+    return ["/opt/rocm/lib"]
+
 @functools.lru_cache()
 def cuda_home_dirs():
     default_dir = "/usr/local/cuda"
     return os.getenv("CUDA_HOME", default=default_dir)
 
+@functools.lru_cache()
+def hip_home_dirs():
+    default_dir = "/opt/rocm"
+    return os.getenv("HIP_HOME", default=default_dir)
+
 @contextlib.contextmanager
 def quiet():
@@ -1147,8 +1156,13 @@ def quiet():
 
 def _build(name, src, srcdir):
-    cuda_lib_dirs = libcuda_dirs()
-    cu_include_dir = os.path.join(cuda_home_dirs(), "include")
+    if torch.version.hip is not None:
+        hip_lib_dirs = libhip_dirs()
+        hip_include_dir = os.path.join(hip_home_dirs(), "include")
+    else:
+        cuda_lib_dirs = libcuda_dirs()
+        cu_include_dir = os.path.join(cuda_home_dirs(), "include")
+
     suffix = sysconfig.get_config_var('EXT_SUFFIX')
     so = os.path.join(srcdir, '{name}{suffix}'.format(name=name, suffix=suffix))
     # try to avoid setuptools if possible
@@ -1159,16 +1173,26 @@ def _build(name, src, srcdir):
     gcc = shutil.which("gcc")
     cc = gcc if gcc is not None else clang
     py_include_dir = get_paths()["include"]
-    cc_cmd = [cc, src, "-O3", f"-I{cu_include_dir}", f"-I{py_include_dir}", f"-I{srcdir}", "-shared", "-fPIC", "-lcuda", "-o", so]
-    cc_cmd += [f"-L{dir}" for dir in cuda_lib_dirs]
+    if torch.version.hip is not None:
+        # the generated launcher calls hip* entry points, so link libamdhip64, not libcuda
+        cc_cmd = [cc, src, "-O3", f"-I{hip_include_dir}", f"-I{py_include_dir}", f"-I{srcdir}", "-shared", "-fPIC", "-lamdhip64", "-o", so]
+        cc_cmd += [f"-L{dir}" for dir in hip_lib_dirs]
+    else:
+        cc_cmd = [cc, src, "-O3", f"-I{cu_include_dir}", f"-I{py_include_dir}", f"-I{srcdir}", "-shared", "-fPIC", "-lcuda", "-o", so]
+        cc_cmd += [f"-L{dir}" for dir in cuda_lib_dirs]
     ret = subprocess.check_call(cc_cmd)
     if ret == 0:
         return so
     # fallback on setuptools
     extra_compile_args = []
-    library_dirs = cuda_lib_dirs
-    include_dirs = [srcdir, cu_include_dir]
-    libraries = ['cuda']
+    if torch.version.hip is not None:
+        library_dirs = hip_lib_dirs
+        include_dirs = [srcdir, hip_include_dir]
+        libraries = ['amdhip64']
+    else:
+        library_dirs = cuda_lib_dirs
+        include_dirs = [srcdir, cu_include_dir]
+        libraries = ['cuda']
     # extra arguments
     extra_link_args = []
     # create extension module
diff --git a/scripts/amd/test.sh b/scripts/amd/test.sh
index 65901c627..7fa7947cb 100755
--- a/scripts/amd/test.sh
+++ b/scripts/amd/test.sh
@@ -26,7 +26,8 @@ rm -rf /tmp/triton
 # python python/test/test_empty.py
 
 # -ex 'ignore 1 472' \
-pytest --verbose python/test/unit/language/test_core.py 2>&1 | tee /dockerx/triton/test_core.log
+# pytest --verbose python/test/unit/language/test_core.py 2>&1 | tee /dockerx/triton/test_core.log
+pytest --verbose "python/test/unit/language/test_core.py::test_empty_kernel[float32]" 2>&1 | tee /dockerx/triton/test_empty_kernel.log
 # pytest --capture=tee-sys --verbose python/test/regression/test_performance.py | tee /dockerx/triton/test_performance.log
 # pytest --capture=tee-sys --verbose python/test/regression/test_performance.py::test_matmul | tee /dockerx/triton/test_performance_matmul.log
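---

Note on the launch call: HIP's module API (hipModuleLoad, hipModuleGetFunction,
hipModuleLaunchKernel) mirrors the CUDA driver API (cuModuleLoad,
cuModuleGetFunction, cuLaunchKernel) argument for argument, which is why
hipModuleLaunchKernel is the drop-in replacement in the generated launcher; the
runtime-API hipLaunchKernel has a different signature and would lose the
kernel-argument array and the dynamic shared memory size. The standalone sketch
below is illustrative only and not part of the patch; the code-object file
"kernel.hsaco", the symbol "my_kernel", and its (float*, int) parameter list
are hypothetical placeholders.

#ifndef __HIP_PLATFORM_AMD__
#define __HIP_PLATFORM_AMD__
#endif
#include <hip/hip_runtime.h>
#include <stdio.h>
#include <stdlib.h>

/* Same check-and-report pattern as the launcher's gpuAssert/CUDA_CHECK. */
#define HIP_CHECK(call)                                                  \
    do {                                                                 \
        hipError_t err_ = (call);                                        \
        if (err_ != hipSuccess) {                                        \
            fprintf(stderr, "HIP error: %s\n", hipGetErrorString(err_)); \
            exit(1);                                                     \
        }                                                                \
    } while (0)

int main(void) {
    hipModule_t module;
    hipFunction_t function;
    /* Load a compiled code object and look up a kernel symbol
       (file and symbol names are hypothetical). */
    HIP_CHECK(hipModuleLoad(&module, "kernel.hsaco"));
    HIP_CHECK(hipModuleGetFunction(&function, module, "my_kernel"));

    int n = 1024;
    hipDeviceptr_t x;
    HIP_CHECK(hipMalloc((void **)&x, n * sizeof(float)));

    /* Kernel arguments travel as an array of pointers to the argument
       values, exactly as with cuLaunchKernel. */
    void *params[] = {&x, &n};
    HIP_CHECK(hipModuleLaunchKernel(function,
                                    /* grid  */ n / 64, 1, 1,
                                    /* block */ 64, 1, 1,
                                    /* sharedMemBytes */ 0,
                                    /* stream (NULL = default) */ 0,
                                    params, NULL));
    HIP_CHECK(hipDeviceSynchronize());

    HIP_CHECK(hipFree(x));
    HIP_CHECK(hipModuleUnload(module));
    return 0;
}

Built with, e.g., hipcc sketch.cpp -o sketch; hipcc supplies the platform macro
itself, so the explicit define only matters under a plain host compiler, which
would also need -I/opt/rocm/include -L/opt/rocm/lib -lamdhip64.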