1. Add an explicit value cache to the emission of index calculations; 2. Move the index-calculation emission logic into ConvertTritonGPUOpToLLVMPatternBase to avoid the redundant build cost caused by templates. See the discussion in this thread by @LyricZhao: https://triton-lang.slack.com/archives/C042VBSQWNS/p1671336755922969
27 lines
995 B
C++
27 lines
995 B
C++
// Include guard for the ConvertLayoutOp -> LLVM lowering declarations.
#ifndef TRITON_CONVERSION_TRITONGPU_TO_LLVM_CONVERT_LAYOUT_OP_H
#define TRITON_CONVERSION_TRITONGPU_TO_LLVM_CONVERT_LAYOUT_OP_H

#include "TritonGPUToLLVMBase.h"

// NOTE(review): using-directives at header scope leak into every translation
// unit that includes this file. They are kept because the declarations below
// spell Value, ArrayRef, Type, Location, etc. without qualification.
using namespace mlir;
using namespace mlir::triton;

using ::mlir::triton::gpu::DotOperandEncodingAttr;

// Predicate over an MMA layout and a dot-operand layout.
//
// NOTE(review): judging by the name, this presumably reports whether a
// conversion from `mmaLayout` to `dotOperandLayout` can take a shortcut path;
// the actual criteria live in the definition, which is not part of this
// header -- confirm there.
//
// Both layouts are taken by non-const reference, matching the out-of-line
// definition's signature.
bool isMmaToDotShortcut(MmaEncodingAttr &mmaLayout,
                        DotOperandEncodingAttr &dotOperandLayout);

// Declared here, defined out of line.
//
// NOTE(review): judging by the name, this presumably emits the stores that
// move a blocked-layout value into shared memory -- confirm against the
// definition. The parameter notes below are inferred from names only:
//   src / llSrc   - presumably the source value and its lowered LLVM form.
//   srcStrides    - presumably per-dimension strides of the source.
//   srcIndices    - presumably precomputed element indices of the source.
//   dst           - presumably the destination value.
//   smemBase      - presumably the base address used for the destination.
//   elemPtrTy     - presumably the pointer type of a destination element.
//   loc           - location passed through for the emitted operations.
//   rewriter      - rewriter, taken by reference.
void storeBlockedToShared(Value src, Value llSrc, ArrayRef<Value> srcStrides,
                          ArrayRef<Value> srcIndices, Value dst, Value smemBase,
                          Type elemPtrTy, Location loc,
                          ConversionPatternRewriter &rewriter);

// Declared here, defined out of line.
//
// NOTE(review): judging by the name and the `patterns` parameter, this
// presumably adds the ConvertLayoutOp -> LLVM conversion patterns to
// `patterns` -- confirm against the definition. The remaining arguments
// (type converter, warp count, axis-info analysis, allocation, `smem`,
// benefit) are presumably forwarded to the patterns being constructed.
//
// `indexCacheInfo` is taken by non-const reference; presumably it carries the
// index-value cache shared with ConvertTritonGPUOpToLLVMPatternBase -- verify
// its lifetime requirements at the call site.
void populateConvertLayoutOpToLLVMPatterns(
    mlir::LLVMTypeConverter &typeConverter, RewritePatternSet &patterns,
    int numWarps, AxisInfoAnalysis &axisInfoAnalysis,
    const Allocation *allocation, Value smem,
    ConvertTritonGPUOpToLLVMPatternBase::IndexCacheInfo &indexCacheInfo,
    PatternBenefit benefit);

#endif // TRITON_CONVERSION_TRITONGPU_TO_LLVM_CONVERT_LAYOUT_OP_H