This commit is contained in:
Michael Melesse
2022-10-17 20:18:44 +00:00
parent 32dbc08c05
commit 4d6d4c9431
3 changed files with 49 additions and 24 deletions

View File

@@ -116,7 +116,7 @@ def test_empty_kernel(dtype_x, device='cuda'):
@triton.jit
def kernel(X, SIZE: tl.constexpr):
pass
check_type_supported(dtype_x)
# check_type_supported(dtype_x)
x = to_triton(numpy_random(SIZE, dtype_str=dtype_x), device=device, dst_type=dtype_x)
kernel[(1, )](x, SIZE=SIZE, num_warps=4)

View File

@@ -904,7 +904,7 @@ def _compile(fn, signature: str, device: int = -1, constants=dict(),
def ty_to_cpp(ty):
if ty[0] == '*':
return "CUdeviceptr"
return "hipDeviceptr_t"
return {
"i1": "int32_t",
"i8": "int8_t",
@@ -962,17 +962,18 @@ def generate_launcher(identifier, constants, signature):
format = "iiiiiKKOOO" + ''.join([format_of(_extracted_type(ty)) for ty in signature.values()])
# generate glue code
src = f"""
#include \"cuda.h\"
if torch.version.hip is not None:
src = f"""
#define __HIP_PLATFORM_AMD__
#include <hip/hip_runtime.h>
#include <Python.h>
static inline void gpuAssert(CUresult code, const char *file, int line)
static inline void gpuAssert(hipError_t code, const char *file, int line)
{{
if (code != CUDA_SUCCESS)
if (code != HIP_SUCCESS)
{{
const char* prefix = "Triton Error [CUDA]: ";
const char* str;
cuGetErrorString(code, &str);
const char* str = hipGetErrorString(code);
char err[1024] = {{0}};
strcat(err, prefix);
strcat(err, str);
@@ -982,20 +983,20 @@ static inline void gpuAssert(CUresult code, const char *file, int line)
#define CUDA_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }}
void _launch(int gridX, int gridY, int gridZ, int num_warps, int shared_memory, CUstream stream, CUfunction function, {arg_decls}) {{
void _launch(int gridX, int gridY, int gridZ, int num_warps, int shared_memory, hipStream_t stream, hipFunction_t function, {arg_decls}) {{
void *params[] = {{ {', '.join(f"&arg{i}" for i in signature.keys() if i not in constants)} }};
if(gridX*gridY*gridZ > 0){{
CUDA_CHECK(cuLaunchKernel(function, gridX, gridY, gridZ, 32*num_warps, 1, 1, shared_memory, stream, params, 0));
hipLaunchKernel(function, dim3(gridX, gridY, gridZ), dim3(32*num_warps, 1, 1), 0 ,0, stream);
}}
}}
static inline CUdeviceptr getPointer(PyObject *obj, int idx) {{
static inline hipDeviceptr_t getPointer(PyObject *obj, int idx) {{
if (PyLong_Check(obj)) {{
return (CUdeviceptr)PyLong_AsUnsignedLongLong(obj);
return (hipDeviceptr_t)PyLong_AsUnsignedLongLong(obj);
}}
if (obj == Py_None) {{
return (CUdeviceptr)0;
return (hipDeviceptr_t)0;
}}
PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr");
if(ptr){{
@@ -1006,10 +1007,10 @@ static inline CUdeviceptr getPointer(PyObject *obj, int idx) {{
if (!PyLong_Check(ret)) {{
PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
}}
return (CUdeviceptr)PyLong_AsUnsignedLongLong(ret);
return (hipDeviceptr_t)PyLong_AsUnsignedLongLong(ret);
}}
PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
return (CUdeviceptr)0;
return (hipDeviceptr_t)0;
}}
@@ -1034,7 +1035,7 @@ static PyObject* launch(PyObject* self, PyObject* args) {{
Py_DECREF(new_args);
}}
_launch(gridX, gridY, gridZ, num_warps, shared_memory, (CUstream)_stream, (CUfunction)_function, {', '.join(f"getPointer(_arg{i},{i})" if ty[0]=="*" else f"_arg{i}"for i, ty in signature.items())});
_launch(gridX, gridY, gridZ, num_warps, shared_memory, (hipStream_t)_stream, (hipFunction_t)_function, {', '.join(f"getPointer(_arg{i},{i})" if ty[0]=="*" else f"_arg{i}"for i, ty in signature.items())});
if (launch_exit_hook != Py_None) {{
PyObject *new_args = NULL;
@@ -1129,12 +1130,20 @@ def libcuda_dirs():
locs = subprocess.check_output(["whereis", "libcuda.so"]).decode().strip().split()[1:]
return [os.path.dirname(loc) for loc in locs]
@functools.lru_cache()
def libhip_dirs():
    # Directories to search for the HIP runtime library (libamdhip64.so).
    # Callers use each entry as a *directory* (linker -L flags and
    # setuptools library_dirs), mirroring libcuda_dirs() which returns
    # os.path.dirname(...) of the located library — so return the
    # containing directory, not the path to the shared object itself.
    return ["/opt/rocm/lib"]
@functools.lru_cache()
def cuda_home_dirs():
    # Root of the CUDA toolkit installation. Honors $CUDA_HOME when set,
    # otherwise falls back to the conventional install prefix. Cached via
    # lru_cache since the environment is not expected to change at runtime.
    return os.getenv("CUDA_HOME", default="/usr/local/cuda")
@functools.lru_cache()
def hip_home_dirs():
    # Root of the ROCm/HIP installation. $HIP_HOME overrides the standard
    # /opt/rocm prefix. Cached via lru_cache since the environment is not
    # expected to change at runtime.
    return os.getenv("HIP_HOME", default="/opt/rocm")
@contextlib.contextmanager
def quiet():
@@ -1147,8 +1156,13 @@ def quiet():
def _build(name, src, srcdir):
cuda_lib_dirs = libcuda_dirs()
cu_include_dir = os.path.join(cuda_home_dirs(), "include")
if torch.version.hip is not None:
hip_lib_dirs = libhip_dirs()
hip_include_dir = os.path.join(hip_home_dirs(), "include")
else:
cuda_lib_dirs = libcuda_dirs()
cu_include_dir = os.path.join(cuda_home_dirs(), "include")
suffix = sysconfig.get_config_var('EXT_SUFFIX')
so = os.path.join(srcdir, '{name}{suffix}'.format(name=name, suffix=suffix))
# try to avoid setuptools if possible
@@ -1159,16 +1173,26 @@ def _build(name, src, srcdir):
gcc = shutil.which("gcc")
cc = gcc if gcc is not None else clang
py_include_dir = get_paths()["include"]
cc_cmd = [cc, src, "-O3", f"-I{cu_include_dir}", f"-I{py_include_dir}", f"-I{srcdir}", "-shared", "-fPIC", "-lcuda", "-o", so]
cc_cmd += [f"-L{dir}" for dir in cuda_lib_dirs]
if torch.version.hip is not None:
cc_cmd = [cc, src, "-O3", f"-I{hip_include_dir}", f"-I{py_include_dir}", f"-I{srcdir}", "-shared", "-fPIC", "-lcuda", "-o", so]
cc_cmd += [f"-L{dir}" for dir in hip_lib_dirs]
else:
cc_cmd = [cc, src, "-O3", f"-I{cu_include_dir}", f"-I{py_include_dir}", f"-I{srcdir}", "-shared", "-fPIC", "-lcuda", "-o", so]
cc_cmd += [f"-L{dir}" for dir in cuda_lib_dirs]
ret = subprocess.check_call(cc_cmd)
if ret == 0:
return so
# fallback on setuptools
extra_compile_args = []
library_dirs = cuda_lib_dirs
include_dirs = [srcdir, cu_include_dir]
libraries = ['cuda']
if torch.version.hip is not None:
library_dirs = hip_lib_dirs
include_dirs = [srcdir, hip_include_dir]
libraries = ['cuda']
else:
library_dirs = cuda_lib_dirs
include_dirs = [srcdir, cu_include_dir]
libraries = ['cuda']
# extra arguments
extra_link_args = []
# create extension module

View File

@@ -26,7 +26,8 @@ rm -rf /tmp/triton
# python python/test/test_empty.py
# -ex 'ignore 1 472' \
pytest --verbose python/test/unit/language/test_core.py 2>&1 | tee /dockerx/triton/test_core.log
# pytest --verbose python/test/unit/language/test_core.py 2>&1 | tee /dockerx/triton/test_core.log
pytest --verbose python/test/unit/language/test_core.py::test_empty_kernel[float32] 2>&1 | tee /dockerx/triton/test_empty_kernel.log
# pytest --capture=tee-sys --verbose python/test/regression/test_performance.py | tee /dockerx/triton/test_performance.log
# pytest --capture=tee-sys --verbose python/test/regression/test_performance.py::test_matmul | tee /dockerx/triton/test_performance_matmul.log