[CODEGEN] Major performance improvements on A100 (#70)
Improved handling of asynchronous copy, scheduling, and synchronization for A100. Triton now achieves CUTLASS-like performance on large square dense matrix multiplication tasks.
This commit is contained in:
committed by
Philippe Tillet
parent
045ab5d62a
commit
5b83259592
102
python/setup.py
102
python/setup.py
@@ -15,21 +15,21 @@ import distutils.spawn
|
||||
import torch
|
||||
|
||||
def find_llvm():
|
||||
versions = ['-10', '-10.0', '']
|
||||
supported = ['llvm-config{v}'.format(v=v) for v in versions]
|
||||
versions = ["-10", "-10.0", ""]
|
||||
supported = ["llvm-config{v}".format(v=v) for v in versions]
|
||||
paths = [distutils.spawn.find_executable(cfg) for cfg in supported]
|
||||
paths = [p for p in paths if p is not None]
|
||||
if paths:
|
||||
return paths[0]
|
||||
config = distutils.spawn.find_executable('llvm-config')
|
||||
instructions = 'Please install llvm-10-dev'
|
||||
config = distutils.spawn.find_executable("llvm-config")
|
||||
instructions = "Please install llvm-10-dev"
|
||||
if config is None:
|
||||
raise RuntimeError('Could not find llvm-config. ' + instructions)
|
||||
version = os.popen('{config} --version'.format(config=config)).read()
|
||||
raise RuntimeError('Version {v} not supported. '.format(v=version) + instructions)
|
||||
raise RuntimeError("Could not find llvm-config. " + instructions)
|
||||
version = os.popen("{config} --version".format(config=config)).read()
|
||||
raise RuntimeError("Version {v} not supported. ".format(v=version) + instructions)
|
||||
|
||||
class CMakeExtension(Extension):
|
||||
def __init__(self, name, path, sourcedir=''):
|
||||
def __init__(self, name, path, sourcedir=""):
|
||||
Extension.__init__(self, name, sources=[])
|
||||
self.sourcedir = os.path.abspath(sourcedir)
|
||||
self.path = path
|
||||
@@ -37,84 +37,84 @@ class CMakeExtension(Extension):
|
||||
class CMakeBuild(build_ext):
|
||||
def run(self):
|
||||
try:
|
||||
out = subprocess.check_output(['cmake', '--version'])
|
||||
out = subprocess.check_output(["cmake", "--version"])
|
||||
except OSError:
|
||||
raise RuntimeError("CMake must be installed to build the following extensions: " +
|
||||
", ".join(e.name for e in self.extensions))
|
||||
|
||||
if platform.system() == "Windows":
|
||||
cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', out.decode()).group(1))
|
||||
if cmake_version < '3.1.0':
|
||||
cmake_version = LooseVersion(re.search(r"version\s*([\d.]+)", out.decode()).group(1))
|
||||
if cmake_version < "3.1.0":
|
||||
raise RuntimeError("CMake >= 3.1.0 is required on Windows")
|
||||
|
||||
for ext in self.extensions:
|
||||
self.build_extension(ext)
|
||||
|
||||
def build_extension(self, ext):
|
||||
#self.debug = True
|
||||
# self.debug = True
|
||||
self.debug = False
|
||||
extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.path)))
|
||||
# python directories
|
||||
python_include_dirs = distutils.sysconfig.get_python_inc()
|
||||
python_lib_dirs = distutils.sysconfig.get_config_var('LIBDIR')
|
||||
python_lib_dirs = distutils.sysconfig.get_config_var("LIBDIR")
|
||||
torch_include_dirs = include_paths(True)
|
||||
torch_library_dirs = library_paths(True)
|
||||
cxx11abi = str(int(torch._C._GLIBCXX_USE_CXX11_ABI))
|
||||
cmake_args = [
|
||||
'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir,
|
||||
'-DBUILD_TUTORIALS=OFF',
|
||||
'-DBUILD_PYTHON_MODULE=ON',
|
||||
"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir,
|
||||
"-DBUILD_TUTORIALS=OFF",
|
||||
"-DBUILD_PYTHON_MODULE=ON",
|
||||
#'-DPYTHON_EXECUTABLE=' + sys.executable,
|
||||
#'-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON,
|
||||
'-DPYTHON_INCLUDE_DIRS=' + ';'.join([python_include_dirs] + include_paths(True)),
|
||||
'-DPYTHON_LINK_DIRS=' + ';'.join(library_paths(True)),
|
||||
'-DTORCH_CXX11_ABI=' + cxx11abi,
|
||||
'-DTORCH_LIBRARIES=c10;c10_cuda;torch;torch_cuda;torch_cpu;torch_python;triton',
|
||||
'-DLLVM_CONFIG=' + find_llvm()
|
||||
"-DPYTHON_INCLUDE_DIRS=" + ";".join([python_include_dirs] + include_paths(True)),
|
||||
"-DPYTHON_LINK_DIRS=" + ";".join(library_paths(True)),
|
||||
"-DTORCH_CXX11_ABI=" + cxx11abi,
|
||||
"-DTORCH_LIBRARIES=c10;c10_cuda;torch;torch_cuda;torch_cpu;torch_python;triton",
|
||||
"-DLLVM_CONFIG=" + find_llvm(),
|
||||
]
|
||||
# configuration
|
||||
cfg = 'Debug' if self.debug else 'Release'
|
||||
cfg = 'Release'
|
||||
build_args = ['--config', cfg]
|
||||
cfg = "Debug" if self.debug else "Release"
|
||||
build_args = ["--config", cfg]
|
||||
|
||||
if platform.system() == "Windows":
|
||||
cmake_args += ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(cfg.upper(), extdir)]
|
||||
cmake_args += ["-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}".format(cfg.upper(), extdir)]
|
||||
if sys.maxsize > 2**32:
|
||||
cmake_args += ['-A', 'x64']
|
||||
build_args += ['--', '/m']
|
||||
cmake_args += ["-A", "x64"]
|
||||
build_args += ["--", "/m"]
|
||||
else:
|
||||
cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
|
||||
build_args += ['--', '-j4']
|
||||
cmake_args += ["-DCMAKE_BUILD_TYPE=" + cfg]
|
||||
build_args += ["--", "-j4"]
|
||||
|
||||
env = os.environ.copy()
|
||||
if not os.path.exists(self.build_temp):
|
||||
os.makedirs(self.build_temp)
|
||||
sourcedir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'src'))
|
||||
subprocess.check_call(['cmake', sourcedir] + cmake_args, cwd=self.build_temp, env=env)
|
||||
subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
|
||||
sourcedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "src"))
|
||||
subprocess.check_call(["cmake", sourcedir] + cmake_args, cwd=self.build_temp, env=env)
|
||||
subprocess.check_call(["cmake", "--build", "."] + build_args, cwd=self.build_temp)
|
||||
|
||||
setup(
|
||||
name='triton',
|
||||
version='1.0.0',
|
||||
author='Philippe Tillet',
|
||||
author_email='phil@openai.com',
|
||||
description='A language and compiler for custom Deep Learning operations',
|
||||
long_description='',
|
||||
packages=['triton', 'triton/_C', 'triton/ops', 'triton/ops/blocksparse'],
|
||||
install_requires=['numpy', 'torch'],
|
||||
package_data={'triton/ops': ['*.c'], 'triton/ops/blocksparse': ['*.c']},
|
||||
name="triton",
|
||||
version="1.0.0",
|
||||
author="Philippe Tillet",
|
||||
author_email="phil@openai.com",
|
||||
description="A language and compiler for custom Deep Learning operations",
|
||||
long_description="",
|
||||
packages=["triton", "triton/_C", "triton/ops", "triton/ops/blocksparse"],
|
||||
install_requires=["numpy", "torch"],
|
||||
package_data={"triton/ops": ["*.c"], "triton/ops/blocksparse": ["*.c"]},
|
||||
include_package_data=True,
|
||||
ext_modules=[CMakeExtension('triton', 'triton/_C/')],
|
||||
cmdclass={'build_ext': CMakeBuild},
|
||||
ext_modules=[CMakeExtension("triton", "triton/_C/")],
|
||||
cmdclass={"build_ext": CMakeBuild},
|
||||
zip_safe=False,
|
||||
# for PyPI
|
||||
keywords=['Compiler', 'Deep Learning'],
|
||||
url='https://github.com/ptillet/triton/',
|
||||
download_url='https://github.com/ptillet/triton/archive/v0.1.tar.gz',
|
||||
keywords=["Compiler", "Deep Learning"],
|
||||
url="https://github.com/ptillet/triton/",
|
||||
download_url="https://github.com/ptillet/triton/archive/v0.1.tar.gz",
|
||||
classifiers=[
|
||||
'Development Status :: 3 - Alpha', # Chose either "3 - Alpha", "4 - Beta" or "5 - Production/Stable" as the current state of your package
|
||||
'Intended Audience :: Developers', # Define that your audience are developers
|
||||
'Topic :: Software Development :: Build Tools',
|
||||
'License :: OSI Approved :: MIT License', # Again, pick a license
|
||||
'Programming Language :: Python :: 3.6',
|
||||
"Development Status :: 3 - Alpha", # Chose either "3 - Alpha", "4 - Beta" or "5 - Production/Stable" as the current state of your package
|
||||
"Intended Audience :: Developers", # Define that your audience are developers
|
||||
"Topic :: Software Development :: Build Tools",
|
||||
"License :: OSI Approved :: MIT License", # Again, pick a license
|
||||
"Programming Language :: Python :: 3.6",
|
||||
],
|
||||
)
|
||||
|
Reference in New Issue
Block a user