[FRONTEND] Now using raw compiler syscalls when possible (#678)
@@ -6,11 +6,13 @@ import functools
 import hashlib
 import io
 import os
+import shutil
 import subprocess
 import sys
 import sysconfig
 import tempfile
 import warnings
+from sysconfig import get_paths
 from typing import Any, Dict, Set, Tuple, Union
 
 import setuptools
@@ -917,58 +919,6 @@ def generate_name_initializer(signature):
     src
 
 
-@contextlib.contextmanager
-def quiet():
-    old_stdout, old_stderr = sys.stdout, sys.stderr
-    sys.stdout, sys.stderr = io.StringIO(), io.StringIO()
-    try:
-        yield
-    finally:
-        sys.stdout, sys.stderr = old_stdout, old_stderr
-
-
-@functools.lru_cache()
-def libcuda_dir():
-    loc = subprocess.check_output(["whereis", "libcuda.so"]).decode().strip().split()[-1]
-    return os.path.dirname(loc)
-
-
-def _build(name, src, path):
-    # add framework
-    extra_compile_args = []
-    library_dirs = [libcuda_dir()]
-    include_dirs = [path, "/usr/local/cuda/include/"]
-    libraries = ['cuda']
-    # extra arguments
-    extra_link_args = []
-    # create extension module
-    ext = setuptools.Extension(
-        name=name,
-        language='c++',
-        sources=[src],
-        include_dirs=include_dirs,
-        extra_compile_args=extra_compile_args + ['-O3'],
-        extra_link_args=extra_link_args,
-        library_dirs=library_dirs,
-        libraries=libraries,
-    )
-    # build extension module
-    args = ['build_ext']
-    args.append('--build-temp=' + path)
-    args.append('--build-lib=' + path)
-    args.append('-q')
-    args = dict(
-        name=name,
-        ext_modules=[ext],
-        script_args=args,
-    )
-    # with quiet():
-    setuptools.setup(**args)
-    suffix = sysconfig.get_config_var('EXT_SUFFIX')
-    so = os.path.join(path, '{name}{suffix}'.format(name=name, suffix=suffix))
-    return so
-
-
 def binary_name_to_header_name(name):
     if len(name) > 128:
         # avoid filename too long errors (filename limit is 255)
@@ -1030,7 +980,7 @@ unsigned int {name}_shmem = {shmem_size};"""
 #include \"cuda.h\"
 #include <Python.h>
 
-inline void gpuAssert(CUresult code, const char *file, int line)
+static inline void gpuAssert(CUresult code, const char *file, int line)
 {{
   if (code != CUDA_SUCCESS)
   {{
@@ -1048,7 +998,7 @@ inline void gpuAssert(CUresult code, const char *file, int line)
 static CUmodule module = 0;
 static CUfunction function = 0;
 
-static void init_function(const char* name, const unsigned char* src, size_t n_shared_bytes, int64_t device){{
+static inline void init_function(const char* name, const unsigned char* src, size_t n_shared_bytes, int64_t device){{
   CUmodule mod;
   CUfunction fun;
   CUDA_CHECK(cuModuleLoadData(&mod, src));
@@ -1070,7 +1020,7 @@ static void init_function(const char* name, const unsigned char* src, size_t n_s
   function = fun;
 }}
 
-static void init_module(CUdevice device) {{
+static inline void init_module(CUdevice device) {{
   {func_init}
 }}
 
@@ -1209,16 +1159,72 @@ def make_cache_key(fn, signature, configs, constants, num_warps, num_stages):
     key = hashlib.md5(key.encode("utf-8")).hexdigest()
     return key
 
 
-def make_shared_object(fn, constants, signature, num_warps, binaries, tmpdir):
-    src = generate_torch_glue(fn.__name__, constants, signature, num_warps, binaries, tmpdir)
-    src_path = os.path.join(tmpdir, "main.c")
-    with open(src_path, "w") as f:
-        f.write(src)
-    with quiet():
-        bin_path = _build(fn.__name__, src_path, tmpdir)
-    with open(bin_path, "rb") as f:
-        return f.read()
+# utilties for generating and compiling C wrappers
+
+@functools.lru_cache()
+def libcuda_dir():
+    loc = subprocess.check_output(["whereis", "libcuda.so"]).decode().strip().split()[-1]
+    return os.path.dirname(loc)
+
+
+@contextlib.contextmanager
+def quiet():
+    old_stdout, old_stderr = sys.stdout, sys.stderr
+    sys.stdout, sys.stderr = io.StringIO(), io.StringIO()
+    try:
+        yield
+    finally:
+        sys.stdout, sys.stderr = old_stdout, old_stderr
+
+
+def _build(name, src, srcdir):
+    cuda_lib_dir = libcuda_dir()
+    cu_include_dir = "/usr/local/cuda/include"
+    suffix = sysconfig.get_config_var('EXT_SUFFIX')
+    so = os.path.join(srcdir, '{name}{suffix}'.format(name=name, suffix=suffix))
+    # try to avoid setuptools if possible
+    cc = os.environ.get("CC")
+    if cc is None:
+        # TODO: support more things here.
+        clang = shutil.which("clang")
+        gcc = shutil.which("gcc")
+        cc = gcc if gcc is not None else clang
+    py_include_dir = get_paths()["include"]
+    ret = subprocess.check_call([cc, src, "-O3", f"-I{cu_include_dir}", f"-I{py_include_dir}", f"-I{srcdir}", "-shared", "-fPIC", f"-L{cuda_lib_dir}", "-lcuda", "-o", so])
+    if ret == 0:
+        return so
+    # fallback on setuptools
+    extra_compile_args = []
+    library_dirs = [cuda_lib_dir]
+    include_dirs = [srcdir, cu_include_dir]
+    libraries = ['cuda']
+    # extra arguments
+    extra_link_args = []
+    # create extension module
+    ext = setuptools.Extension(
+        name=name,
+        language='c',
+        sources=[src],
+        include_dirs=include_dirs,
+        extra_compile_args=extra_compile_args + ['-O3'],
+        extra_link_args=extra_link_args,
+        library_dirs=library_dirs,
+        libraries=libraries,
+    )
+    # build extension module
+    args = ['build_ext']
+    args.append('--build-temp=' + srcdir)
+    args.append('--build-lib=' + srcdir)
+    args.append('-q')
+    args = dict(
+        name=name,
+        ext_modules=[ext],
+        script_args=args,
+    )
+    with quiet():
+        setuptools.setup(**args)
+    return so
 
 
 def compile(fn, signature: str, device: int = -1, constants=dict(), num_warps: int = 4, num_stages: int = 3, extern_libs=None, configs=None):
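
The new `_build` above first tries to drive the system C compiler directly on the generated glue source, and keeps the setuptools `build_ext` path only as a fallback. As a rough standalone sketch of what that direct invocation amounts to, assuming gcc or clang on PATH, CUDA headers under /usr/local/cuda/include, and a hypothetical generated source file (the module name, file names, and library paths below are illustrative assumptions, not values from this commit):

import os
import shutil
import subprocess
import sysconfig
from sysconfig import get_paths

# Hypothetical inputs for illustration only.
name = "example_kernel"        # module name baked into the generated C glue
src = "example_kernel.c"       # generated C source, assumed to exist on disk
srcdir = os.path.dirname(os.path.abspath(src))

# Pick a compiler the same way: $CC if set, else gcc, else clang.
cc = os.environ.get("CC") or shutil.which("gcc") or shutil.which("clang")
assert cc is not None, "no C compiler found"

py_include_dir = get_paths()["include"]          # where Python.h lives
cu_include_dir = "/usr/local/cuda/include"       # assumed CUDA install prefix
cuda_lib_dir = "/usr/lib/x86_64-linux-gnu"       # assumed libcuda.so location
suffix = sysconfig.get_config_var("EXT_SUFFIX")  # e.g. ".cpython-310-x86_64-linux-gnu.so"
so = os.path.join(srcdir, f"{name}{suffix}")

# One compiler call: compile and link the glue into a shared object that
# links against the CUDA driver library and is named like a CPython extension.
subprocess.check_call([
    cc, src, "-O3",
    f"-I{cu_include_dir}", f"-I{py_include_dir}", f"-I{srcdir}",
    "-shared", "-fPIC",
    f"-L{cuda_lib_dir}", "-lcuda",
    "-o", so,
])
print("built", so)

Calling the compiler once like this avoids spawning a full setuptools/distutils build just to produce a single shared object, which is what the commit title refers to.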
@@ -1243,10 +1249,14 @@ def compile(fn, signature: str, device: int = -1, constants=dict(), num_warps: i
     with tempfile.TemporaryDirectory() as tmpdir:
         all_constants = set(constants.keys())
         all_constants.update(configs[0].equal_to_1)
-        so = make_shared_object(fn, all_constants, signature, num_warps, binaries, tmpdir)
+        src = generate_torch_glue(fn.__name__, constants, signature, num_warps, binaries, tmpdir)
+        src_path = os.path.join(tmpdir, "main.c")
+        with open(src_path, "w") as f:
+            f.write(src)
+        so = _build(fn.__name__, src_path, tmpdir)
+        with open(so, "rb") as f:
+            cache_manager.put(f.read())
 
-        # write shared object to cache
-        cache_manager.put(so)
     return CompiledKernel(fn.__name__, cache_manager.bin_path)
 
 
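Because the glue is compiled against the Python include directory and named with `EXT_SUFFIX`, the artifact that ends up in the cache is an ordinary CPython extension module. A minimal sketch of loading such a `.so` back by path with standard `importlib` machinery, assuming a hypothetical cached file name (the commit's own `CompiledKernel` and `cache_manager` internals are not shown in this diff):

import importlib.util

def load_extension(name, so_path):
    """Load a compiled CPython extension module from an explicit file path."""
    spec = importlib.util.spec_from_file_location(name, so_path)
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod

# Hypothetical cached artifact produced by _build() and cache_manager.put().
kernel_mod = load_extension(
    "example_kernel",
    "/tmp/cache/example_kernel.cpython-310-x86_64-linux-gnu.so",
)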