From 9b100302d3818e8ac396c743cd691f8147a5edb5 Mon Sep 17 00:00:00 2001
From: Philippe Tillet <phil@openai.com>
Date: Thu, 10 Feb 2022 01:57:39 -0800
Subject: [PATCH] [FRONTEND] Now using pybind11 to release GIL (#458)

---
 python/src/triton.cc | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/python/src/triton.cc b/python/src/triton.cc
index e60f3be37..3410df6b8 100644
--- a/python/src/triton.cc
+++ b/python/src/triton.cc
@@ -328,13 +328,10 @@ void init_triton_runtime(py::module &&m) {
     if(grid_0*grid_1*grid_2 > 0) {
       // release the gil in case the enqueue blocks
       // cuda will block if too many ops are enqueued
-      Py_BEGIN_ALLOW_THREADS
-
+      py::gil_scoped_release allow_threads;
       drv::dispatch::cuLaunchKernel((CUfunction)kernel, grid_0, grid_1, grid_2, 
                                     _num_warps*32, 1, 1, shared_mem, (CUstream)_stream, 
                                      nullptr, config);
-
-       Py_END_ALLOW_THREADS
    }
     return bin;
   });
@@ -394,14 +391,13 @@ void init_triton_runtime(py::module &&m) {
     size_t args_size = args.size();
     // release the gil in case the enqueue blocks
     // cuda will block if too many ops are enqueued
-    Py_BEGIN_ALLOW_THREADS
+    py::gil_scoped_release allow_threads;
     if(backend == HOST)
       host_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, block_1, block_2, args_ptr, args_size, shared_mem);
     if(backend == CUDA)
       cu_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, block_1, block_2, args_ptr, args_size, shared_mem);
     if(backend == ROCM)
       hip_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, block_1, block_2, args_ptr, args_size, shared_mem);
-    Py_END_ALLOW_THREADS
   });
 
   
@@ -468,7 +464,7 @@ std::tuple<std::string, asm_map_t, int> cu_compile_ttir(const std::string& name,
                                                                asm_map_t &asm_map){
 
   int n_shared_bytes;
-  Py_BEGIN_ALLOW_THREADS
+  py::gil_scoped_release allow_threads;
   llvm::LLVMContext ctx;
   // device properties
   CUdevice dev = (CUdevice)device;
@@ -494,7 +490,6 @@ std::tuple<std::string, asm_map_t, int> cu_compile_ttir(const std::string& name,
     py::bytes bytes(cubin);
     asm_map["cubin"] = bytes;
   }
-  Py_END_ALLOW_THREADS
   return std::make_tuple(name, asm_map, n_shared_bytes);
 }