diff --git a/lib/driver/dispatch.cc b/lib/driver/dispatch.cc
index a6e93db1a..d8350881b 100755
--- a/lib/driver/dispatch.cc
+++ b/lib/driver/dispatch.cc
@@ -95,8 +95,13 @@ bool dispatch::cuinit(){
   if(cuda_==nullptr){
     putenv((char*)"CUDA_CACHE_DISABLE=1");
     std::string libcuda = tools::getenv("TRITON_LIBCUDA");
-    if(libcuda.empty())
+    if(libcuda.empty()){
       cuda_ = dlopen("libcuda.so", RTLD_LAZY);
+      if(!cuda_)
+        cuda_ = dlopen("libcuda.so.1", RTLD_LAZY);
+      if(!cuda_)
+        throw std::runtime_error("Could not find `libcuda.so`. Make sure it is in your LD_LIBRARY_PATH.");
+    }
     else
       cuda_ = dlopen(libcuda.c_str(), RTLD_LAZY);
   }
diff --git a/python/bench/bench_cross_entropy.py b/python/bench/bench_cross_entropy.py
index 2c4d61d9c..5347ae24a 100644
--- a/python/bench/bench_cross_entropy.py
+++ b/python/bench/bench_cross_entropy.py
@@ -32,9 +32,9 @@ def bench_op(M, N, dtype, mode, provider):
         y = op(x, idx)
         dy = torch.randn_like(y)
         fn = lambda: y.backward(dy, retain_graph=True)
-        mean_ms, min_ms, max_ms = triton.testing.do_bench(fn, grad_to_none=x)
+        mean_ms, min_ms, max_ms = triton.testing.do_bench(fn, grad_to_none=[x])
     return gbps(mean_ms), gbps(min_ms), gbps(max_ms)
 
 
 if __name__ == '__main__':
-    bench_op.run('tmp', False)
\ No newline at end of file
+    bench_op.run(print_data=True)
\ No newline at end of file
diff --git a/python/triton/language.py b/python/triton/language.py
index 87fc30a59..a3ab0df54 100644
--- a/python/triton/language.py
+++ b/python/triton/language.py
@@ -51,7 +51,7 @@ def builtin(fn):
     def wrapper(*args, **kwargs):
         if 'builder' not in kwargs or \
            kwargs['builder'] is None:
-            raise ValueError("Builder argument must be provided outside of JIT functions")
+            raise ValueError("Builder argument must be provided outside of JIT functions. Did you forget to add @triton.jit ?")
         return fn(*args, **kwargs)
 
     if wrapper.__doc__:
diff --git a/python/triton/testing.py b/python/triton/testing.py
index eb4d89956..2c9aa092f 100644
--- a/python/triton/testing.py
+++ b/python/triton/testing.py
@@ -123,7 +123,8 @@ def do_bench(fn, warmup=25, rep=100, grad_to_none=None, percentiles=[0.2, 0.8]):
         # if it contains a backward pass. So we clear the
         # provided gradients
         if grad_to_none is not None:
-            grad_to_none.grad = None
+            for x in grad_to_none:
+                x.grad = None
         # we clear the L2 cache before each run
         cache.zero_()
         # record time of `fn`
@@ -246,6 +247,7 @@ class Mark:
             plt.savefig(os.path.join(save_path, f"{bench.plot_name}.png"))
         df = df[[bench.x_names[0]] + bench.line_names]
         if print_data:
+            print(bench.plot_name + ':')
             print(df)
         if save_path:
             df.to_csv(os.path.join(save_path, f"{bench.plot_name}.csv"), float_format='%.1f', index=False)
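
Note on the `do_bench` change above: `grad_to_none` now takes a list of tensors rather than a single tensor, so each tensor's `.grad` can be reset between timed runs (repeated backward passes would otherwise accumulate gradients and skew the measurement). A minimal usage sketch mirroring the updated call site in bench_cross_entropy.py; the softmax workload is an arbitrary stand-in, not part of this patch:

    import torch
    import triton

    x = torch.randn(1024, 1024, device='cuda', requires_grad=True)
    y = torch.softmax(x, dim=-1)  # arbitrary differentiable op standing in for the real workload
    dy = torch.randn_like(y)
    fn = lambda: y.backward(dy, retain_graph=True)
    # grad_to_none is now a list: each listed tensor has .grad set to None between runs
    mean_ms, min_ms, max_ms = triton.testing.do_bench(fn, grad_to_none=[x])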
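
Note on the `language.py` change: the `builtin` wrapper guards the `triton.language` ops, so calling one outside an `@triton.jit` kernel (where no `builder` is injected) now points the user at the likely fix. A sketch of how the improved message surfaces, assuming `tl.arange` is one of the wrapped builtins in this version:

    import triton.language as tl

    try:
        tl.arange(0, 16)  # no `builder` kwarg: we are not inside an @triton.jit kernel
    except ValueError as e:
        print(e)  # "... Did you forget to add @triton.jit ?"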