#include "triton/codegen/pass.h"
|
2022-07-13 15:52:21 -07:00
|
|
|
|
|
|
|
#include "llvm/IR/Constants.h"
|
|
|
|
#include "llvm/IR/LegacyPassManager.h"
|
|
|
|
#include "llvm/IR/Module.h"
|
|
|
|
#include "llvm/IR/Verifier.h"
|
|
|
|
#include "llvm/IRReader/IRReader.h"
|
|
|
|
#include "llvm/Linker/Linker.h"
|
|
|
|
#include "llvm/Support/SourceMgr.h"
|
|
|
|
#include "llvm/Transforms/IPO.h"
|
|
|
|
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
2021-04-20 22:29:40 -04:00
|
|
|
#include "triton/codegen/analysis/align.h"
|
|
|
|
#include "triton/codegen/analysis/allocation.h"
|
|
|
|
#include "triton/codegen/analysis/axes.h"
|
|
|
|
#include "triton/codegen/analysis/liveness.h"
|
|
|
|
#include "triton/codegen/analysis/swizzle.h"
|
|
|
|
#include "triton/codegen/selection/generator.h"
|
|
|
|
#include "triton/codegen/transform/coalesce.h"
|
|
|
|
#include "triton/codegen/transform/cts.h"
|
|
|
|
#include "triton/codegen/transform/dce.h"
|
|
|
|
#include "triton/codegen/transform/disassociate.h"
|
2022-07-13 15:52:21 -07:00
|
|
|
#include "triton/codegen/transform/inline.h"
|
2021-04-20 22:29:40 -04:00
|
|
|
#include "triton/codegen/transform/membar.h"
|
|
|
|
#include "triton/codegen/transform/peephole.h"
|
|
|
|
#include "triton/codegen/transform/pipeline.h"
|
2021-05-13 10:42:18 +08:00
|
|
|
#include "triton/codegen/transform/prefetch.h"
|
2021-04-20 22:29:40 -04:00
|
|
|
#include "triton/ir/function.h"
|
|
|
|
#include "triton/ir/module.h"
|
|
|
|
#include "triton/ir/print.h"
|
2022-07-13 15:52:21 -07:00
|
|
|
|
2021-04-20 22:29:40 -04:00
|
|
|

namespace triton {
namespace codegen {

static void link_extern_libs(const ExternLibMap& user_extern_lib_map,
                             const ExternLibMap& target_extern_lib_map,
                             ir::module& ir, llvm::LLVMContext& ctx,
                             std::unique_ptr<llvm::Module>& llvm) {
  // Link each external library required by the generated code, preferring a
  // user-specified path over the target's default.
  for (const auto& iter : target_extern_lib_map) {
    auto& lib_name = iter.first;
    if (user_extern_lib_map.count(lib_name) != 0 &&
        user_extern_lib_map.at(lib_name)->path() != "") {
      // If the user specified a path for this library, use it.
      user_extern_lib_map.at(lib_name)->install(ctx, llvm);
    } else {
      // Otherwise, use the default path.
      iter.second->install(ctx, llvm);
    }
  }

  // Internalize everything except the functions defined in the Triton-IR
  // module, so the optimization run below can inline and strip unused
  // library code.
  std::set<llvm::StringRef> function_names;
  for (auto& func : ir.get_function_list()) {
    function_names.insert(func->get_name());
  }
  llvm::legacy::PassManager pass;
  pass.add(llvm::createInternalizePass([&](const llvm::GlobalValue& v) -> bool {
    if (function_names.count(v.getName()) != 0) {
      // Preserve global functions
      return true;
    }
    // Internalize all device functions
    return false;
  }));

  // Verify the linked module before optimizing it.
  llvm::legacy::PassManager pm;
  pm.add(llvm::createVerifierPass());
  pm.run(*llvm);

  // Append the standard -O3 module pipeline to `pass` (which already holds
  // the internalize pass) and run it.
  llvm::PassManagerBuilder builder;
  builder.OptLevel = 3;
  builder.SizeLevel = 0;
  builder.populateModulePassManager(pass);

  pass.run(*llvm);
}

// TODO:
// There should be a proper pass manager here!
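//
// add_passes_to_emit_bin lowers a Triton-IR module into an optimized LLVM-IR
// module and reports the kernel's static shared-memory usage through
// `shared_static`. A minimal calling sketch (the surrounding driver code and
// variable names are assumptions, not part of this file):
//
//   llvm::LLVMContext ctx;
//   int shared_static = 0;
//   auto llvm_mod = triton::codegen::add_passes_to_emit_bin(
//       ir_mod, ctx, target, /*num_warps=*/4, /*num_stages=*/3,
//       shared_static, extern_lib_map);
//   // `llvm_mod` is then handed to an LLVM backend (e.g. NVPTX) to emit the
//   // final binary.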
std::unique_ptr<llvm::Module> add_passes_to_emit_bin(
    ir::module& ir, llvm::LLVMContext& ctx, codegen::target* target,
    int num_warps, int num_stages, int& shared_static,
    const ExternLibMap& extern_lib_map) {
  // generate llvm code
  std::string name = ir.get_function_list()[0]->get_name();
  std::unique_ptr<llvm::Module> llvm(new llvm::Module(name, ctx));
  // several passes below behave differently on Ampere (sm >= 80) and newer
  bool has_sm80 = target->as_nvidia() && target->as_nvidia()->sm() >= 80;
  // create passes
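  // Analysis passes (align, axes, layouts, liveness, swizzle, allocation)
  // only compute facts about the Triton-IR, while transform passes (inliner,
  // pipeline, disassociate, cts, dce, peephole, coalesce, prefetch, membar)
  // rewrite it; `isel` is the final Triton-IR -> LLVM-IR generator.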
  codegen::analysis::align align;
  codegen::transform::inliner inliner;
  codegen::analysis::axes axes;
  codegen::transform::pipeline pipeline(has_sm80, num_stages);
  codegen::transform::disassociate disassociate;
  codegen::analysis::layouts layouts(&axes, &align, num_warps, target);
  codegen::transform::cts cts(&layouts, has_sm80);
  codegen::analysis::liveness liveness(&layouts);
  codegen::analysis::swizzle swizzle(&layouts, target);
  codegen::analysis::allocation allocation(&liveness);
  codegen::transform::dce dce;
  codegen::transform::peephole peephole(target, &layouts);
  codegen::transform::coalesce coalesce(&align, &layouts, has_sm80);
  codegen::transform::prefetch prefetch_s(target);
  codegen::transform::membar barriers(&liveness, &layouts, &allocation,
                                      &prefetch_s, target);
  codegen::generator isel(&axes, &layouts, &align, &allocation, &swizzle,
                          target, num_warps);
  // run passes
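  // There is no dependency-tracking pass manager here (see the TODO above),
  // so each analysis is simply re-run by hand after every transform that may
  // have invalidated its results.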
  // Inline all calls, clean up, then software-pipeline loops over
  // `num_stages` stages.
  inliner.run(ir);
  dce.run(ir);
  peephole.run(ir);
  dce.run(ir);
  pipeline.run(ir);
  dce.run(ir);
  // ir.print(std::cout);
  disassociate.run(ir);
  dce.run(ir);
  align.run(ir);
  axes.run(ir);
  layouts.run(ir);
  peephole.run(ir);
  dce.run(ir);
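  // GPU-specific lowering: cts inserts copies to/from shared memory,
  // coalesce reorders accesses so global loads/stores coalesce, and swizzle
  // chooses shared-memory layouts that avoid bank conflicts.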
  if (target->is_gpu()) cts.run(ir);
  align.run(ir);
  axes.run(ir);
  layouts.run(ir);
  coalesce.run(ir);
  dce.run(ir);
  align.run(ir);
  dce.run(ir);
  if (target->is_gpu()) cts.run(ir);
  dce.run(ir);
  align.run(ir);
  axes.run(ir);
  layouts.run(ir);
  peephole.run(ir);
  dce.run(ir);
  align.run(ir);
  axes.run(ir);
  layouts.run(ir);
  swizzle.run(ir);
  // std::cout << "---" << std::endl;
  // ir.print(std::cout);
  // std::cout << "---" << std::endl;
  // ir.print(std::cout);
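  // Memory planning: liveness feeds allocation, which assigns shared-memory
  // offsets; prefetch_s schedules prefetches and membar inserts the
  // synchronization barriers that shared-memory reuse requires.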
  liveness.run(ir);
  allocation.run(ir);
  prefetch_s.run(ir);
  barriers.run(ir);
  // exit(1);
  // ir.print(std::cout);
  isel.visit(ir, *llvm);
  shared_static = allocation.allocated_size();
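  // The caller forwards `shared_static` to the runtime, which needs the
  // kernel's shared-memory footprint at launch time.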

  if (target->as_nvidia() && target->as_nvidia()->sm() < 70) {
    // Devices with sm < 70 (Pascal and older) have limited shared memory.
    // Instead of failing with a generic "Invalid argument" error at kernel
    // launch, throw a descriptive error here.
    if (shared_static >= 65536) {
      throw std::runtime_error("Device does not support shared memory of " +
                               std::to_string(shared_static) + " bytes");
    }
  }

  if (isel.get_extern_lib_map().size() > 0) {
    // If there are any calls into external libraries, link those libraries in.
    link_extern_libs(extern_lib_map, isel.get_extern_lib_map(), ir, ctx, llvm);
  }

  return llvm;
}

} // namespace codegen
} // namespace triton