From 969d6de8a20c4ddcb69b00cb56eaf2602240382d Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 29 Oct 2021 01:24:19 -0700 Subject: [PATCH 1/7] [PACKAGING] Bumped dev version to 1.1.2 --- .github/workflows/integration-tests.yml | 2 +- python/setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index ac0fe14f0..987b346a3 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -5,7 +5,7 @@ on: pull_request: branches: - master - - v2 + - v2.0 jobs: diff --git a/python/setup.py b/python/setup.py index 0d962355f..f77b92666 100644 --- a/python/setup.py +++ b/python/setup.py @@ -121,7 +121,7 @@ class CMakeBuild(build_ext): setup( name="triton", - version="1.1.1", + version="1.1.2", author="Philippe Tillet", author_email="phil@openai.com", description="A language and compiler for custom Deep Learning operations", From b7f0e87dc2143cdf9cbc050e34ba80440074c53b Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 29 Oct 2021 10:42:10 -0700 Subject: [PATCH 2/7] [DRIVER] Removed std::cout log message --- lib/driver/llvm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/driver/llvm.cc b/lib/driver/llvm.cc index 3c11fbf35..f3c76ce77 100644 --- a/lib/driver/llvm.cc +++ b/lib/driver/llvm.cc @@ -178,7 +178,7 @@ std::string ptx_to_cubin(const std::string& ptx, int cc) { ofs.close(); std::string cmd; int err; - cmd = ptxas + " -v --gpu-name=sm_" + std::to_string(cc) + " " + fsrc + " -o " + fsrc + ".o"; + cmd = ptxas + " -v --gpu-name=sm_" + std::to_string(cc) + " " + fsrc + " -o " + fsrc + ".o 2> " + flog; err = system(cmd.c_str()); CUmodule ret; std::ifstream _cubin(_fbin, std::ios::binary ); From 0ff1a26b70d816056c1cb1abb67710128dd27624 Mon Sep 17 00:00:00 2001 From: Victor Date: Mon, 6 Dec 2021 18:14:03 -0800 Subject: [PATCH 3/7] fixed p2p tests failing when there are no supported p2p devices (#386) --- python/test/unit/runtime/test_comm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/test/unit/runtime/test_comm.py b/python/test/unit/runtime/test_comm.py index ae843a15f..520462870 100644 --- a/python/test/unit/runtime/test_comm.py +++ b/python/test/unit/runtime/test_comm.py @@ -23,13 +23,13 @@ def get_p2p_matrix(): def get_p2p_devices(): matrix = get_p2p_matrix() idx = np.where(matrix == "OK") - return f"cuda:{idx[0][0]}", f"cuda:{idx[1][0]}" + return [f"cuda:{idx[0][0]}", f"cuda:{idx[1][0]}"] if len(idx[0]) > 0 else [] def get_non_p2p_devices(): matrix = get_p2p_matrix() idx = np.where(matrix == "NS") - return f"cuda:{idx[0][0]}", f"cuda:{idx[1][0]}" + return [f"cuda:{idx[0][0]}", f"cuda:{idx[1][0]}"] if len(idx[0]) > 0 else [] p2p_devices = get_p2p_devices() From 73b04d71b22b36fbe823b579922d6a5986f51a7b Mon Sep 17 00:00:00 2001 From: Victor Date: Tue, 7 Dec 2021 14:10:58 -0800 Subject: [PATCH 4/7] Fixes for building on Windows (#382) * make C++ code compatible with Windows + MSVC * added dlfcn-win32 for cross-platform dlopen * fixed building and pip install on Windows * fixed shared library file name under Windows --- .gitignore | 3 ++ .gitmodules | 3 ++ CMakeLists.txt | 51 +++++++++++++++++++++++++---- deps/dlfcn-win32 | 1 + include/triton/tools/sys/exec.hpp | 8 +++++ include/triton/tools/sys/getenv.hpp | 11 +------ lib/codegen/selection/generator.cc | 20 +++++------ lib/driver/llvm.cc | 28 ++++++++-------- python/setup.py | 4 ++- python/src/triton.cc | 1 + 10 files changed, 90 insertions(+), 40 deletions(-) create mode 100644 .gitmodules create mode 160000 deps/dlfcn-win32 diff --git a/.gitignore b/.gitignore index c10863ae9..b32df68cc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,9 @@ +build/ + __pycache__ .pytest_cache python/build/ python/triton.egg-info/ +python/triton/_C/libtriton.pyd python/triton/_C/libtriton.so diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..2754cffc4 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "deps/dlfcn-win32"] + path = deps/dlfcn-win32 + url = https://github.com/dlfcn-win32/dlfcn-win32.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 8fb73e678..f44c35aa7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,8 @@ cmake_minimum_required(VERSION 3.6) include(ExternalProject) +set(CMAKE_CXX_STANDARD 17) + if(NOT TRITON_LLVM_BUILD_DIR) set(TRITON_LLVM_BUILD_DIR ${CMAKE_BINARY_DIR}) endif() @@ -8,7 +10,9 @@ endif() project(triton) include(CTest) -list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") +if(NOT WIN32) + list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") +endif() # Options option(BUILD_TUTORIALS "Build C++ Triton tutorials" ON) @@ -20,10 +24,19 @@ if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release") endif() -find_library(TERMINFO_LIBRARY tinfo) +if(NOT WIN32) + find_library(TERMINFO_LIBRARY tinfo) +endif() # Compiler flags include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) + +if(WIN32) + SET(BUILD_SHARED_LIBS OFF) + include_directories(${CMAKE_CURRENT_SOURCE_DIR}/deps/dlfcn-win32/src) + add_subdirectory(deps/dlfcn-win32/src ${CMAKE_BINARY_DIR}/dlfcn-win32) +endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__STDC_FORMAT_MACROS -std=gnu++17") @@ -31,7 +44,20 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__STDC_FORMAT_MACROS -std=gnu++17") # LLVM ########## if("${LLVM_LIBRARY_DIR}" STREQUAL "") - find_package(LLVM 11 REQUIRED COMPONENTS "nvptx;amdgpu") + if(WIN32) + find_package(LLVM 13 REQUIRED COMPONENTS nvptx amdgpu) + + include_directories(${LLVM_INCLUDE_DIRS}) + separate_arguments(LLVM_DEFINITIONS_LIST NATIVE_COMMAND ${LLVM_DEFINITIONS}) + add_definitions(${LLVM_DEFINITIONS_LIST}) + + llvm_map_components_to_libnames(LLVM_LIBRARIES support core + NVPTXInfo nvptxcodegen + AMDGPUInfo AMDGPUcodegen + ) + else() + find_package(LLVM 11 REQUIRED COMPONENTS "nvptx;amdgpu") + endif() message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") if(APPLE) set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14") @@ -108,12 +134,25 @@ endif() # Triton file(GLOB_RECURSE LIBTRITON_SRC lib/*.cc) -add_library(triton SHARED ${LIBTRITON_SRC} ${PYTHON_SRC}) +if (WIN32 AND BUILD_PYTHON_MODULE) + find_package(Python3 REQUIRED COMPONENTS Development) + Python3_add_library(triton SHARED ${LIBTRITON_SRC} ${PYTHON_SRC}) + set_target_properties(triton PROPERTIES SUFFIX ".pyd") + set_target_properties(triton PROPERTIES PREFIX "lib") +else() + add_library(triton SHARED ${LIBTRITON_SRC} ${PYTHON_SRC}) +endif() + target_link_options(triton PRIVATE ${LLVM_LDFLAGS}) -target_link_libraries(triton ${LLVM_LIBRARIES} z ${TERMINFO_LIBRARY}) + +if(WIN32) + target_link_libraries(triton PRIVATE ${LLVM_LIBRARIES} dl) # dl is from dlfcn-win32 +else() + target_link_libraries(triton ${LLVM_LIBRARIES} z ${TERMINFO_LIBRARY}) +endif() -if(BUILD_PYTHON_MODULE) +if(BUILD_PYTHON_MODULE AND NOT WIN32) set(CMAKE_SHARED_LIBRARY_SUFFIX ".so") # Check if the platform is MacOS if(APPLE) diff --git a/deps/dlfcn-win32 b/deps/dlfcn-win32 new file mode 160000 index 000000000..522c301ec --- /dev/null +++ b/deps/dlfcn-win32 @@ -0,0 +1 @@ +Subproject commit 522c301ec366e9b42205ae21617780d37cc0e9f0 diff --git a/include/triton/tools/sys/exec.hpp b/include/triton/tools/sys/exec.hpp index 243f0f482..5b664553e 100644 --- a/include/triton/tools/sys/exec.hpp +++ b/include/triton/tools/sys/exec.hpp @@ -13,6 +13,14 @@ namespace tools { +#ifdef _WIN32 +#define popen _popen +#define pclose _pclose +#endif + +#ifndef WEXITSTATUS +#define WEXITSTATUS(stat_val) ((unsigned)(stat_val) & 255) +#endif int exec(const std::string& cmd, std::string& result) { char buffer[128]; diff --git a/include/triton/tools/sys/getenv.hpp b/include/triton/tools/sys/getenv.hpp index 0319d8868..755a84a66 100755 --- a/include/triton/tools/sys/getenv.hpp +++ b/include/triton/tools/sys/getenv.hpp @@ -33,19 +33,10 @@ namespace tools inline std::string getenv(const char * name) { - #ifdef _MSC_VER - char* cache_path = 0; - std::size_t sz = 0; - _dupenv_s(&cache_path, &sz, name); - #else - const char * cstr = std::getenv(name); - #endif + const char * cstr = std::getenv(name); if(!cstr) return ""; std::string result(cstr); - #ifdef _MSC_VER - free(cache_path); - #endif return result; } diff --git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc index 7316e047a..9253dd319 100644 --- a/lib/codegen/selection/generator.cc +++ b/lib/codegen/selection/generator.cc @@ -441,18 +441,18 @@ std::tuple generator::fp8x4_to_fp16x4(Value *in0 "lop3.b32 $1, b1, 0x80008000, a1, 0xf8; \n\t" "}", "=r,=r,r", false); Value *packed_in = UndefValue::get(vec_ty(i8_ty, 4)); - packed_in = insert_elt(packed_in, in0, (int)0); - packed_in = insert_elt(packed_in, in1, (int)1); - packed_in = insert_elt(packed_in, in2, (int)2); - packed_in = insert_elt(packed_in, in3, (int)3); + packed_in = insert_elt(packed_in, in0, (uint64_t)0); + packed_in = insert_elt(packed_in, in1, (uint64_t)1); + packed_in = insert_elt(packed_in, in2, (uint64_t)2); + packed_in = insert_elt(packed_in, in3, (uint64_t)3); Value *in = bit_cast(packed_in, i32_ty); Value *ret = call(ptx, {in}); Value *packed_ret0 = extract_val(ret, {0}); Value *packed_ret1 = extract_val(ret, {1}); - Value *ret0 = extract_elt(packed_ret0, (int)0); - Value *ret1 = extract_elt(packed_ret0, (int)1); - Value *ret2 = extract_elt(packed_ret1, (int)0); - Value *ret3 = extract_elt(packed_ret1, (int)1); + Value *ret0 = extract_elt(packed_ret0, (uint64_t)0); + Value *ret1 = extract_elt(packed_ret0, (uint64_t)1); + Value *ret2 = extract_elt(packed_ret1, (uint64_t)0); + Value *ret3 = extract_elt(packed_ret1, (uint64_t)1); return std::make_tuple(ret0, ret1, ret2, ret3); } @@ -694,11 +694,11 @@ void generator::visit_load_inst(ir::load_inst* x){ // --- // finally call inline ASM // --- - InlineAsm *_asm = InlineAsm::get(asm_ty, asm_oss.str(), asm_cstrt, true); + InlineAsm *inlineAsm = InlineAsm::get(asm_ty, asm_oss.str(), asm_cstrt, true); std::vector args = {pred, ptr}; for(Value *v: others) args.push_back(v); - Value *_ret = call(_asm, args); + Value *_ret = call(inlineAsm, args); // --- // extract and store return values // --- diff --git a/lib/driver/llvm.cc b/lib/driver/llvm.cc index f3c76ce77..0a355d106 100644 --- a/lib/driver/llvm.cc +++ b/lib/driver/llvm.cc @@ -20,7 +20,9 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include -#include +#if __has_include() + #include +#endif #include #include #include "triton/driver/llvm.h" @@ -165,10 +167,10 @@ std::string ptx_to_cubin(const std::string& ptx, int cc) { return ""; // compile ptx with ptxas - char _fsrc[] = "/tmp/triton_k_XXXXXX"; - char _flog[] = "/tmp/triton_l_XXXXXX"; - mkstemp(_fsrc); - mkstemp(_flog); + char _fsrc[L_tmpnam]; + char _flog[L_tmpnam]; + std::tmpnam(_fsrc); + std::tmpnam(_flog); std::string fsrc = _fsrc; std::string flog = _flog; std::string fbin = fsrc + ".o"; @@ -202,10 +204,10 @@ CUmodule ptx_to_cumodule(const std::string& ptx, int cc) { // Use PTXAS via system call if(use_system_ptxas){ // compile ptx with ptxas - char _fsrc[] = "/tmp/triton_k_XXXXXX"; - char _flog[] = "/tmp/triton_l_XXXXXX"; - mkstemp(_fsrc); - mkstemp(_flog); + char _fsrc[L_tmpnam]; + char _flog[L_tmpnam]; + std::tmpnam(_fsrc); + std::tmpnam(_flog); std::string fsrc = _fsrc; std::string flog = _flog; std::string fbin = fsrc + ".o"; @@ -232,8 +234,8 @@ CUmodule ptx_to_cumodule(const std::string& ptx, int cc) { CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER, CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER, CU_JIT_LOG_VERBOSE}; - unsigned int errbufsize = 8192; - unsigned int logbufsize = 8192; + const unsigned int errbufsize = 8192; + const unsigned int logbufsize = 8192; char _err[errbufsize]; char _log[logbufsize]; void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)_err, (void*)(uintptr_t)logbufsize, (void*)_log, (void*)1}; @@ -344,8 +346,8 @@ hipModule_t amdgpu_to_hipmodule(const std::string& path) { hipJitOption opt[] = {hipJitOptionErrorLogBufferSizeBytes, hipJitOptionErrorLogBuffer, hipJitOptionInfoLogBufferSizeBytes, hipJitOptionInfoLogBuffer, hipJitOptionLogVerbose}; - unsigned int errbufsize = 8192; - unsigned int logbufsize = 8192; + const unsigned int errbufsize = 8192; + const unsigned int logbufsize = 8192; char _err[errbufsize]; char _log[logbufsize]; void* optval[] = {(void*)(uintptr_t)errbufsize, diff --git a/python/setup.py b/python/setup.py index f77b92666..d57ea96ed 100644 --- a/python/setup.py +++ b/python/setup.py @@ -24,6 +24,8 @@ def get_llvm(): paths = [p for p in paths if p is not None] if paths: return '', '' + if platform.system() == "Windows": + return '', '' # download if nothing is installed name = 'clang+llvm-11.0.1-x86_64-linux-gnu-ubuntu-16.04' dir = '/tmp' @@ -105,7 +107,7 @@ class CMakeBuild(build_ext): build_args = ["--config", cfg] if platform.system() == "Windows": - cmake_args += ["-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}".format(cfg.upper(), extdir)] + cmake_args += ["-DCMAKE_RUNTIME_OUTPUT_DIRECTORY_{}={}".format(cfg.upper(), extdir)] if sys.maxsize > 2**32: cmake_args += ["-A", "x64"] build_args += ["--", "/m"] diff --git a/python/src/triton.cc b/python/src/triton.cc index 9298f9db4..7bb94d5fe 100644 --- a/python/src/triton.cc +++ b/python/src/triton.cc @@ -14,6 +14,7 @@ #include #include #include +#include #include #include "llvm/IR/Module.h" #include "llvm/IR/LegacyPassManager.h" From eb077fc993787d7473c33eb8c345fa4cc6745ede Mon Sep 17 00:00:00 2001 From: Victor Date: Thu, 16 Dec 2021 22:09:52 -0800 Subject: [PATCH 5/7] [RUNTIME] fixed NVidia DLL names on Windows (#392) --- lib/driver/dispatch.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lib/driver/dispatch.cc b/lib/driver/dispatch.cc index 4059ac235..9e2aca432 100755 --- a/lib/driver/dispatch.cc +++ b/lib/driver/dispatch.cc @@ -91,9 +91,13 @@ void* dispatch::fname ## _; bool dispatch::cuinit(){ if(cuda_==nullptr){ + #ifdef _WIN32 + cuda_ = dlopen("cudart64_110.dll", RTLD_LAZY); + #else cuda_ = dlopen("libcuda.so", RTLD_LAZY); if(!cuda_) cuda_ = dlopen("libcuda.so.1", RTLD_LAZY); + #endif if(!cuda_) throw std::runtime_error("Could not find `libcuda.so`. Make sure it is in your LD_LIBRARY_PATH."); } @@ -176,8 +180,13 @@ CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent) * NVML * ------------------- */ bool dispatch::nvmlinit(){ + #ifdef _WIN32 + if(nvml_==nullptr) + nvml_ = dlopen("nvml.dll", RTLD_LAZY); + #else if(nvml_==nullptr) nvml_ = dlopen("libnvidia-ml.so", RTLD_LAZY); + #endif nvmlReturn_t (*fptr)(); nvmlInit_v2_ = dlsym(nvml_, "nvmlInit_v2"); *reinterpret_cast(&fptr) = nvmlInit_v2_; From 3edc2633e9876005da2423f1463fe2e215f52f04 Mon Sep 17 00:00:00 2001 From: Noah Ziems Date: Wed, 29 Dec 2021 18:09:34 -0500 Subject: [PATCH 6/7] [TUTORIALS] Fix 01-vector-add.py typo (#406) --- python/tutorials/01-vector-add.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tutorials/01-vector-add.py b/python/tutorials/01-vector-add.py index 4446cf6e9..b25698a4e 100644 --- a/python/tutorials/01-vector-add.py +++ b/python/tutorials/01-vector-add.py @@ -37,7 +37,7 @@ def add_kernel( offsets = block_start + tl.arange(0, BLOCK_SIZE) # Create a mask to guard memory operations against out-of-bounds accesses mask = offsets < n_elements - # Load x and y from DRAM, masking out any extar elements in case the input is not a + # Load x and y from DRAM, masking out any extra elements in case the input is not a # multiple of the block size x = tl.load(x_ptr + offsets, mask=mask) y = tl.load(y_ptr + offsets, mask=mask) From 6f7acad48fd94384f5b1fe0148e2c7f584a6aefd Mon Sep 17 00:00:00 2001 From: Shantanu <12621235+hauntsaninja@users.noreply.github.com> Date: Thu, 6 Jan 2022 12:04:33 -0800 Subject: [PATCH 7/7] [CODEGEN] Avoid use of deprecated AST nodes (#418) Co-authored-by: hauntsaninja <> --- python/triton/code_gen.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/python/triton/code_gen.py b/python/triton/code_gen.py index 8622333bf..90018028d 100644 --- a/python/triton/code_gen.py +++ b/python/triton/code_gen.py @@ -305,9 +305,6 @@ class CodeGenerator(ast.NodeVisitor): for stmt in node.orelse: ast.NodeVisitor.generic_visit(self, stmt) - def visit_Str(self, node): - return ast.literal_eval(node) - def visit_Subscript(self, node): assert node.ctx.__class__.__name__ == "Load" lhs = self.visit(node.value) @@ -374,9 +371,6 @@ class CodeGenerator(ast.NodeVisitor): def visit_Index(self, node): return self.visit(node.value) - def visit_NameConstant(self, node): - return node.value - def visit_keyword(self, node): return {node.arg: self.visit(node.value)} @@ -393,8 +387,18 @@ class CodeGenerator(ast.NodeVisitor): return fn(*args, _builder=self.builder, **kws) return fn(*args, **kws) - def visit_Num(self, node): - return node.n + def visit_Constant(self, node): + return node.value + + if sys.version_info < (3, 8): + def visit_NameConstant(self, node): + return node.value + + def visit_Num(self, node): + return node.n + + def visit_Str(self, node): + return ast.literal_eval(node) def visit_Attribute(self, node): lhs = self.visit(node.value)