[PYTHON] Merged blocksparse branch:

* Example for blocksparse matrix multiplication
* Simplified Triton kernel API
* Revived auto-tuning in einsum
2020-03-05 13:08:07 -05:00
parent ea37ba5d35
commit 268894a5ce
8 changed files with 433 additions and 85 deletions
--- a/lib/runtime/function.cc
+++ b/lib/runtime/function.cc
@@ -3,6 +3,7 @@
 #include <regex>
 #include <functional>
 #include <algorithm>
+#include <memory>
 #include "triton/codegen/analysis/axes.h"
 #include "triton/codegen/analysis/allocation.h"
 #include "triton/codegen/analysis/liveness.h"
@@ -343,6 +344,17 @@ function::cache_key_t function::get_key(driver::stream *stream, const std::vecto
 // returns program with best compilation options for given parameter
 function::caller* function::autotune(driver::stream* stream, const grid_fn_ty& grid_fn,
                                       const std::vector<arg>& args) {
+//  // copy buffer arguments so that auto-tuning doesn't corrupt data
+//  std::list<std::shared_ptr<driver::cu_buffer>> copies;
+//  std::vector<arg> args = args;
+//  for(arg& x: args)
+//    if(x.type() == BUFFER_T){
+//      driver::buffer* old = x.buffer();
+//      driver::context* ctx = old->context();
+//      size_t size = old->size();
+//      copies.push_back(std::make_shared<driver::cu_buffer>(ctx, size));
+//      x = arg(copies.back().get());
+//    }
  // fast path -- no autotuning necessary
  if(callers_.size() == 1)
    return &*callers_.begin()->second;