[CODEGEN] Performance improvement on A100 (#125)

Improved codegen for Ampere GPUs.

    * Make the layout pass recognize the multistage pipelined pattern.
    * The pipeline pass can now perform the multistage pipelining transformation automatically.
    * Remove extra barriers (inserted by the prefetch pass and for WAR hazards) on Ampere.
    * Update the code generator (generator.cc) so that Triton emits n-buffered shared memory loads/stores; see the sketch after this list.
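For readers unfamiliar with the pattern, here is a minimal hand-written CUDA sketch of n-buffered (multistage-pipelined) shared-memory loads. The names and shapes (NUM_STAGES, TILE, pipelined_sum, one block with blockDim.x == TILE) are hypothetical, not Triton's generated code, and the real generator additionally uses Ampere's asynchronous copy path rather than plain loads:

    // Illustrative N-stage pipelining over shared memory (not Triton's codegen).
    // Assumes a single block with blockDim.x == TILE.
    #define NUM_STAGES 3
    #define TILE 128

    __global__ void pipelined_sum(const float *__restrict__ in, float *out, int num_tiles) {
      __shared__ float buf[NUM_STAGES][TILE];  // n-buffered shared memory
      float acc = 0.f;
      // Prologue: prefetch the first NUM_STAGES-1 tiles.
      for (int s = 0; s < NUM_STAGES - 1 && s < num_tiles; ++s)
        buf[s][threadIdx.x] = in[s * TILE + threadIdx.x];
      __syncthreads();
      // Steady state: consume tile i while loading tile i + NUM_STAGES - 1.
      // A naive load-then-compute loop needs two barriers per iteration
      // (RAW after the load, WAR before the next load); with separate
      // buffers per stage, load and compute touch different buffers, so a
      // single __syncthreads per iteration suffices -- plausibly the kind
      // of extra WAR barrier this commit elides.
      for (int i = 0; i < num_tiles; ++i) {
        int load = i + NUM_STAGES - 1;
        if (load < num_tiles)
          buf[load % NUM_STAGES][threadIdx.x] = in[load * TILE + threadIdx.x];
        acc += buf[i % NUM_STAGES][threadIdx.x];
        __syncthreads();
      }
      out[threadIdx.x] = acc;
    }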
Author:    daadaada
Date:      2021-06-21 14:25:13 +08:00
Committer: Philippe Tillet
Parent:    5a51f3e529
Commit:    d8d6b715c8
21 changed files with 855 additions and 174 deletions


@@ -33,7 +33,10 @@ void init_triton_driver(py::module &&m) {
         CUdevice handle;
         drv::dispatch::cuDeviceGet(&handle, dev_id);
         return new drv::cu_device(handle, take_ownership);
-      }));
+      }))
+      .def("max_shared_memory", [](drv::cu_device *self) {
+        return self->max_shared_memory();
+      });
   // host device
   py::class_<drv::host_device, drv::device>(m, "host_device")
       .def(py::init<>());
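The new max_shared_memory binding exposes the device's per-block shared memory capacity to the Python side, which matters because n-buffering multiplies shared memory usage. A plausible sketch of the underlying query, assuming it wraps the CUDA driver API's cuDeviceGetAttribute (the actual Triton implementation may differ):

    #include <cuda.h>

    // Hedged sketch: query the opt-in per-block shared memory limit,
    // which exceeds the 48 KB default on recent GPUs such as the A100.
    int max_shared_memory(CUdevice dev) {
      int smem = 0;
      cuDeviceGetAttribute(&smem,
                           CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
                           dev);
      return smem;
    }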
@@ -75,11 +78,11 @@ void init_triton_driver(py::module &&m) {
 void init_triton_codegen(py::module &&m) {
   m.def(
-      "add_passes_to_emit_bin", [](ir::module &ir, drv::device *dev, int num_warps) {
+      "add_passes_to_emit_bin", [](ir::module &ir, drv::device *dev, int num_warps, int num_stages) {
         drv::module *mod;
         drv::kernel *ker;
         size_t shared_mem;
-        triton::codegen::add_passes_to_emit_bin(ir, dev, num_warps, mod, ker, shared_mem);
+        triton::codegen::add_passes_to_emit_bin(ir, dev, num_warps, num_stages, mod, ker, shared_mem);
         std::stringstream ss;
         ir::print(ir, ss);
         return std::make_tuple(mod, ker, shared_mem, ss.str());
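Plumbing num_stages through to add_passes_to_emit_bin lets callers trade shared memory for latency hiding: each extra stage holds another copy of the pipelined tiles, so the reported shared_mem grows roughly linearly with num_stages. A hypothetical caller-side guard (names assumed, not taken from this diff) would compare that footprint against the new max_shared_memory binding:

    #include <stdexcept>

    // Hypothetical sketch: reject a (num_stages, tile-size) configuration
    // whose n-buffered footprint exceeds the device's shared memory.
    void check_shared_mem(drv::cu_device *dev, size_t shared_mem) {
      if (shared_mem > static_cast<size_t>(dev->max_shared_memory()))
        throw std::runtime_error(
            "kernel needs more shared memory than the device provides; "
            "try a smaller num_stages");
    }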