`insert_slice_async` is decomposed into `load + insert_slice` in the backend. It is unclear whether V100 performance can match the master branch with this approach. Performance might improve if the instructions are arranged in the following form: ``` %0 = load %1 = load %2 = load ... insert_slice %0 insert_slice %1 insert_slice %2 ``` This was tested on A100 with the decomposition enabled manually. Tests on V100 haven't been integrated yet; we can divide the tests into two phases: 1. Test only `load`, `insert_slice`, and `insert_slice_async`, given TritonGPU IRs in `test_backend.py`. 2. End-to-end GEMM tests on V100.
41 lines · 1.1 KiB · C++
#ifndef TRITON_TARGET_LLVMIRTRANSLATION_H
#define TRITON_TARGET_LLVMIRTRANSLATION_H

#include "llvm/ADT/StringRef.h"

#include <memory>
#include <string>
#include <vector>

// Forward declarations keep this header light: callers that only pass
// pointers/references to these types don't need the full LLVM/MLIR headers.
namespace llvm {
class Module;
class LLVMContext;
} // namespace llvm

namespace mlir {
class ModuleOp;
} // namespace mlir

namespace mlir {
namespace triton {

// Record external dependent libraries on \p module.
// NOTE(review): presumably names[i] pairs with paths[i] — the vectors are
// expected to have equal length; confirm against the implementation.
void addExternalLibs(mlir::ModuleOp &module,
                     const std::vector<std::string> &names,
                     const std::vector<std::string> &paths);

// Translate the TritonGPU dialect in \p module to an LLVM IR module created
// inside \p llvmContext. \p computeCapability selects the target NVIDIA
// architecture (e.g. 70 for V100, 80 for A100).
// Returns nullptr on failure.
std::unique_ptr<llvm::Module>
translateTritonGPUToLLVMIR(llvm::LLVMContext *llvmContext,
                           mlir::ModuleOp module,
                           int computeCapability);

// Translate an MLIR module already in the LLVM dialect to an LLVM IR module
// created inside \p llvmContext. Returns nullptr on failure.
std::unique_ptr<llvm::Module>
translateLLVMToLLVMIR(llvm::LLVMContext *llvmContext, mlir::ModuleOp module);

// Link the external library at \p path into \p module.
// Returns true on success, false otherwise.
bool linkExternLib(llvm::Module &module, llvm::StringRef path);

} // namespace triton
} // namespace mlir

#endif // TRITON_TARGET_LLVMIRTRANSLATION_H