[CORE] Auto-tuning now copies scalar buffers. Still needs to copy all buffers that are both read from and written to.
committed by Philippe Tillet
parent 78cd54b0c8
commit 5995cbff8e
@@ -347,21 +347,24 @@ function::cache_key_t function::get_key(driver::stream *stream, const std::vecto
// returns program with best compilation options for given parameter
function::caller* function::autotune(driver::stream* stream, const grid_fn_ty& grid_fn,
                                     const std::vector<arg>& args) {
  // // copy buffer argument so that auto-tuning doesn't corrupt data
  // std::list<std::shared_ptr<driver::cu_buffer>> copies;
  // std::vector<arg> args = args;
  // for(arg& x: args)
  //   if(x.type() == BUFFER_T){
  //     driver::buffer* old = x.buffer();
  //     driver::context* ctx = old->context();
  //     size_t size = old->size();
  //     copies.push_back(std::make_shared<driver::cu_buffer>(ctx, size));
  //     x = arg(copies.back().get());
  //   }
  // fast path -- no autotuning necessary
  if(callers_.size() == 1)
    return &*callers_.begin()->second;
  // slow path -- autotuning necessary
  // copy buffer argument so that auto-tuning doesn't corrupt data
  std::list<std::shared_ptr<driver::cu_buffer>> copies;
  std::vector<arg> _args = args;
  for(size_t i = 0; i < args.size(); i++)
    if(_args[i].type() == BUFFER_T){
      driver::buffer* old = _args[i].buffer();
      size_t size = old->size();
      // only copy scalars
      // TODO: change that
      if(size != 4 && size != 2)
        continue;
      copies.push_back(std::make_shared<driver::cu_buffer>(old->context(), size));
      _args[i] = arg(copies.back().get());
    }
  double best_ts = INFINITY;
  caller* ret = nullptr;
  for(auto &x : callers_){
@@ -373,6 +376,7 @@ function::caller* function::autotune(driver::stream* stream, const grid_fn_ty& g
    ret = (ts < best_ts) ? current : ret;
    best_ts = std::min(ts, best_ts);
  }
  stream->synchronize();
  return ret;
}
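The commit message notes that only 2- and 4-byte (scalar) buffers are copied so far, and that buffers which are both read from and written to still need the same treatment. The following standalone CUDA/C++ sketch is not part of this commit and uses only generic CUDA runtime calls; it illustrates the pattern the TODO points at, namely snapshotting a read-write buffer into a scratch copy before timing each candidate configuration so that tuning never corrupts the caller's data. All names in it (scale_inplace, user_buf, scratch, the candidate block sizes) are illustrative.

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

// Candidate kernel: reads and writes the same buffer in place.
__global__ void scale_inplace(float* x, int n, float a) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    x[i] *= a;
}

int main() {
  const int n = 1 << 20;
  std::vector<float> host(n, 1.0f);
  float* user_buf = nullptr;
  cudaMalloc(&user_buf, n * sizeof(float));
  cudaMemcpy(user_buf, host.data(), n * sizeof(float), cudaMemcpyHostToDevice);

  // Scratch copy reused for every tuning run.
  float* scratch = nullptr;
  cudaMalloc(&scratch, n * sizeof(float));

  cudaEvent_t beg, end;
  cudaEventCreate(&beg);
  cudaEventCreate(&end);

  const int candidates[] = {128, 256, 512};
  float best_ms = 1e30f;
  int best_block = candidates[0];
  for (int block : candidates) {
    // Re-snapshot the user buffer so every candidate sees identical input
    // and the caller's data is never modified during tuning.
    cudaMemcpy(scratch, user_buf, n * sizeof(float), cudaMemcpyDeviceToDevice);
    cudaEventRecord(beg);
    scale_inplace<<<(n + block - 1) / block, block>>>(scratch, n, 2.0f);
    cudaEventRecord(end);
    cudaEventSynchronize(end);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, beg, end);
    if (ms < best_ms) { best_ms = ms; best_block = block; }
  }

  // The real launch then runs on the untouched user buffer.
  scale_inplace<<<(n + best_block - 1) / best_block, best_block>>>(user_buf, n, 2.0f);
  cudaDeviceSynchronize();
  printf("best block size: %d (%.3f ms)\n", best_block, best_ms);

  cudaEventDestroy(beg);
  cudaEventDestroy(end);
  cudaFree(scratch);
  cudaFree(user_buf);
  return 0;
}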
@@ -194,7 +194,7 @@ void gen_make_handles(std::ostream &os, const std::vector<ir::argument*>& args)
    if(!arg->get_type()->is_pointer_ty())
      continue;
    const std::string& name = arg->get_name();
    os << " drv::cu_buffer cu_" + name + "(ctx, " + name + "->tensor_data().size(), (CUdeviceptr)" + name + "->tensor_data().data(), false);\n ";
    os << " drv::cu_buffer cu_" + name + "(ctx, " + name + "->nbytes(), (CUdeviceptr)" + name + "->tensor_data().data(), false);\n ";
  }
}
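Assembled from the string literal in the new line above, the generated C++ for a pointer argument looks roughly as follows. The argument name x is a placeholder chosen here for illustration, and ctx is assumed to be declared earlier in the generated file.

// Emitted by the updated gen_make_handles for a hypothetical pointer argument "x";
// the buffer size now comes from nbytes() instead of tensor_data().size().
drv::cu_buffer cu_x(ctx, x->nbytes(), (CUdeviceptr)x->tensor_data().data(), false);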
@@ -524,7 +524,7 @@ void gen_torch_make_handles(std::ostream &os,
    os << " " << to_c_ty(arg) << " " << name << " = " << th_name << ";" << std::endl;
  else{
    os << " CHECK_INPUT(" << th_name << ");" << std::endl;
    os << " drv::cu_buffer " + name + "(ctx, " + th_name + ".storage().size(), "
    os << " drv::cu_buffer " + name + "(ctx, " + th_name + ".nbytes(), "
          " (CUdeviceptr)((char*)" + th_name + ".storage().data() + " + th_name + ".storage_offset() * " + th_name + ".itemsize()), false);" << std::endl;
  }
}
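Likewise, concatenating the updated string literals in gen_torch_make_handles gives generated code of roughly the following shape; name and th_name stand in for the identifiers the generator produces and are shown here as literal placeholders.

// Emitted for a torch::Tensor argument; the buffer size now comes from nbytes().
CHECK_INPUT(th_name);
drv::cu_buffer name(ctx, th_name.nbytes(),
    (CUdeviceptr)((char*)th_name.storage().data() + th_name.storage_offset() * th_name.itemsize()), false);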
@@ -561,16 +561,7 @@ void gen_torch_make_launch_function(std::ostream &os,
    os << "args.push_back(rt::arg(ty" << i << ", val" << i << "));\n ";
  }
  os << " std::function<void()> run = [&](){\n ";
  os << " (*id_fn_map.at({id, dev_id}))({";
  for(unsigned i = 0; i < args.size() ; i++){
    std::string name = "arg_" + std::to_string(i);
    if(args[i] == rt::BUFFER_T)
      name = "&" + name;
    if(i > 0)
      os << ", ";
    os << name;
  }
  os << "}, *id_grid_map.at({id, dev_id}), &stream);\n";
  os << " (*id_fn_map.at({id, dev_id}))(args , *id_grid_map.at({id, dev_id}), &stream);\n";
  os << " };\n";
  os << " run();\n";
  os << " if(bench > 0)\n ";