[PYTHON][TESTS][DOC] Various improvements to the API and code quality:

* Simplified the `triton.kernel` API to achieve lower latency (see the sketch after this list):
  > `.data_ptr()` must now be passed as a kernel argument; there is no more implicit conversion from `torch.Tensor`
  > compilation options are now constant attributes, i.e., `opt.d('VAR')` becomes `opt.VAR`
  > a `torch.device` must now be passed explicitly to `triton.kernel` (it is no longer inferred from `torch.Tensor` arguments)
* C++ tests moved to `python/tests/`
* C++ tutorial created in `tutorials/`
* Python tutorial created in `python/tutorials/`
* Version changed to 1.0alpha
* No longer copying C++ headers into the Python package
* Added `python/triton/ops/` package for pre-written Triton ops
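
Put together, the new calling convention looks roughly like the sketch below. This is a minimal, illustrative sketch, not code from this commit: the Triton-C source, the `device=` keyword, the grid lambda, and the `BLOCK` option name are assumptions; only the `.data_ptr()` arguments, the explicit `torch.device`, and the `opt.VAR` attribute access are what the list above describes.

```python
import torch
import triton

# Hypothetical Triton-C kernel source (vector addition); the real source
# and signature depend on your kernel.
src = """
__global__ void add(float* x, float* y, float* z, int N) {
  ...
}
"""

# The device is now passed explicitly to triton.kernel; it is no longer
# inferred from torch.Tensor arguments. (The exact keyword is an assumption.)
device = torch.device('cuda:0')
kernel = triton.kernel(src, device=device)

x = torch.randn(1024, device=device)
y = torch.randn(1024, device=device)
z = torch.empty_like(x)

# Raw pointers (.data_ptr()) are passed directly, with no implicit
# conversion from torch.Tensor; compilation options are read as attributes
# (opt.BLOCK) rather than via opt.d('BLOCK').
N = x.numel()
kernel(x.data_ptr(), y.data_ptr(), z.data_ptr(), N,
       grid=lambda opt: ((N + opt.BLOCK - 1) // opt.BLOCK,))
```

Passing raw pointers and an explicit device means the launch path no longer has to inspect tensor arguments on every call, which is where the latency win comes from.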
Author: Philippe Tillet
Date:   2021-01-29 17:27:16 -05:00
parent a5a477c36b
commit 269ebc12e5
63 changed files with 2255 additions and 3883 deletions


@@ -76,7 +76,7 @@ void host_stream::synchronize() {
   hst_->args.clear();
 }
-void host_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void **args, size_t args_size) {
+void host_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size) {
   auto hst = kernel->module()->hst();
   hst_->futures->reserve(hst_->futures->size() + grid[0]*grid[1]*grid[2]);
   char* params = new char[args_size];
@@ -113,7 +113,7 @@ void cu_stream::synchronize() {
   dispatch::cuStreamSynchronize(*cu_);
 }
-void cu_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void** args, size_t args_size) {
+void cu_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size) {
   void *config[] = {
     CU_LAUNCH_PARAM_BUFFER_POINTER, args,
     CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
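
Reading the diff: `enqueue` now takes a single contiguous argument buffer (`void*` plus its byte size) instead of an array of per-argument pointers (`void**`). This matches CUDA's `CU_LAUNCH_PARAM_BUFFER_POINTER` / `CU_LAUNCH_PARAM_BUFFER_SIZE` launch path, which consumes exactly such a flat buffer, and it lets the caller serialize arguments once rather than chase one pointer per argument at launch time, in line with the lower-latency goal above. Below is a minimal Python sketch of what such a flat buffer might look like for a kernel taking three pointers and an int; the pointer values and the `<QQQi` layout (three 8-byte pointers followed by a 4-byte int) are illustrative assumptions, not this commit's code.

```python
import struct

# Hypothetical kernel arguments: three device pointers and an int32 length.
x_ptr, y_ptr, z_ptr, n = 0x7F0000000000, 0x7F0000001000, 0x7F0000002000, 1024

# Serialize everything into one contiguous little-endian buffer:
# 'Q' = 8-byte pointer, 'i' = 4-byte int. The buffer and its byte size are
# what the new enqueue(..., void* args, size_t args_size) signature carries
# down to CU_LAUNCH_PARAM_BUFFER_POINTER / CU_LAUNCH_PARAM_BUFFER_SIZE.
args = struct.pack('<QQQi', x_ptr, y_ptr, z_ptr, n)
args_size = len(args)  # 28 bytes for this layout
```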