From f45e31ba7c5e9ec81abfc0cf180c11d224fd80d1 Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Thu, 29 Sep 2022 21:06:22 -0400
Subject: [PATCH] [FRONTEND] Make sure to hold the gil when creating python
 objects (#726)

Without this patch, a debug version of python complains that:
```
Fatal Python error: Python memory allocator called without holding the GIL
Python runtime state: initialized
```
---
 python/src/triton.cc | 75 +++++++++++++++++++++++---------------------
 1 file changed, 40 insertions(+), 35 deletions(-)

diff --git a/python/src/triton.cc b/python/src/triton.cc
index 2a5052199..8918e8809 100644
--- a/python/src/triton.cc
+++ b/python/src/triton.cc
@@ -437,46 +437,51 @@ typedef std::map<std::string, py::object> asm_map_t;
 void init_triton_codegen(py::module &&m) {
   m.def("compile_ttir",
       [](backend_t backend, ir::module &ir, uint64_t device, int num_warps, int num_stages, py::dict& extern_libs, size_t cc) {
-          py::gil_scoped_release allow_threads;
-          std::string name = ir.get_function_list()[0]->get_name();
-          // record asm as we generate
-          asm_map_t asm_map;
           std::ostringstream ttir;
-          ir.print(ttir);
-          asm_map["ttir"] = py::cast(ttir.str());
-          llvm::LLVMContext ctx;
-          // construct extern lib map
-          triton::codegen::ExternLibMap extern_lib_map;
-          for (auto item : extern_libs) {
-            auto name = item.first.cast<std::string>();
-            auto path = item.second.cast<std::string>();
-            extern_lib_map.emplace(
-                name, triton::codegen::create_extern_lib(name, path));
-          }
-          // device properties
-          if (cc == 0) {
-            CUdevice dev = (CUdevice)device;
-            size_t major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>(dev);
-            size_t minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>(dev);
-            cc = major*10 + minor;
-          }
-          int version;
-          std::string ptxas_path = drv::path_to_ptxas(version);
-          // Triton-IR -> NVPTX LLVM-IR
-          triton::codegen::nvidia_cu_target target(cc);
           int n_shared_bytes;
-          auto llvm = triton::codegen::add_passes_to_emit_bin(
-              ir, ctx, &target, num_warps, num_stages, n_shared_bytes, extern_lib_map);
           std::string tmp;
-          llvm::raw_string_ostream llir(tmp);
-          llir << *llvm;
-          llir.flush();
+          std::string ptx;
+          std::string cubin;
+          std::string name;
+          { // Scope where the GIL is released
+            py::gil_scoped_release allow_threads;
+            name = ir.get_function_list()[0]->get_name();
+            ir.print(ttir);
+            llvm::LLVMContext ctx;
+            // construct extern lib map
+            triton::codegen::ExternLibMap extern_lib_map;
+            for (auto item : extern_libs) {
+              auto name = item.first.cast<std::string>();
+              auto path = item.second.cast<std::string>();
+              extern_lib_map.emplace(
+                  name, triton::codegen::create_extern_lib(name, path));
+            }
+            // device properties
+            if (cc == 0) {
+              CUdevice dev = (CUdevice)device;
+              size_t major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>(dev);
+              size_t minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>(dev);
+              cc = major*10 + minor;
+            }
+            int version;
+            std::string ptxas_path = drv::path_to_ptxas(version);
+            // Triton-IR -> NVPTX LLVM-IR
+            triton::codegen::nvidia_cu_target target(cc);
+            auto llvm = triton::codegen::add_passes_to_emit_bin(
+                ir, ctx, &target, num_warps, num_stages, n_shared_bytes, extern_lib_map);
+            llvm::raw_string_ostream llir(tmp);
+            llir << *llvm;
+            llir.flush();
+            // LLVM-IR -> PTX
+            ptx = drv::llir_to_ptx(llvm.get(), cc, version);
+            // PTX -> Binary
+            cubin = drv::ptx_to_cubin(ptx, ptxas_path, cc);
+          }
+          asm_map_t asm_map;
+          asm_map["ttir"] = py::cast(ttir.str());
           asm_map["llir"] = py::cast(tmp);
-          // LLVM-IR -> PTX
-          std::string ptx = drv::llir_to_ptx(llvm.get(), cc, version);
           asm_map["ptx"] = py::cast(ptx);
-          // PTX -> Binary
-          std::string cubin = drv::ptx_to_cubin(ptx, ptxas_path, cc);
+
           if(!cubin.empty()){
             py::bytes bytes(cubin);
             asm_map["cubin"] = bytes;