1. Add an explicit value cache for the emitted index calculations; 2. Move the index-calculation emitting logic into ConvertTritonGPUOpToLLVMPatternBase to avoid the redundant build cost caused by templates. See the discussion in this thread by @LyricZhao: https://triton-lang.slack.com/archives/C042VBSQWNS/p1671336755922969
16 lines
519 B
C++
16 lines
519 B
C++
#ifndef TRITON_CONVERSION_TRITONGPU_TO_LLVM_REDUCE_OP_H
#define TRITON_CONVERSION_TRITONGPU_TO_LLVM_REDUCE_OP_H

#include "TritonGPUToLLVMBase.h"

// NOTE(review): using-directives at header scope leak these namespaces into
// every file that includes this header. They are kept as-is because the
// declaration below relies on the unqualified names, and other includers may
// as well.
using namespace mlir;
using namespace mlir::triton;

/// Declaration only; the definition lives outside this header.
///
/// Judging by the name and signature, this presumably adds the rewrite
/// patterns that lower the reduce op to LLVM into `patterns` -- confirm
/// against the definition.
///
/// @param typeConverter    LLVM type converter, taken by reference.
/// @param patterns         Pattern set, taken by non-const reference
///                         (presumably the set the patterns are added to).
/// @param numWarps         Warp count; how it is used is not visible here.
/// @param axisInfoAnalysis Axis-info analysis, taken by non-const reference.
/// @param allocation       Pointer to an Allocation; this header does not
///                         state whether null is accepted.
/// @param smem             Value passed by copy; presumably the shared-memory
///                         base -- TODO confirm.
/// @param indexCacheInfo   ConvertTritonGPUOpToLLVMPatternBase::IndexCacheInfo,
///                         taken by non-const reference; presumably the cache
///                         for emitted index values -- TODO confirm.
/// @param benefit          Pattern benefit; presumably forwarded to the
///                         registered patterns.
void populateReduceOpToLLVMPatterns(
    mlir::LLVMTypeConverter &typeConverter, RewritePatternSet &patterns,
    int numWarps, AxisInfoAnalysis &axisInfoAnalysis,
    const Allocation *allocation, Value smem,
    ConvertTritonGPUOpToLLVMPatternBase::IndexCacheInfo &indexCacheInfo,
    PatternBenefit benefit);

#endif // TRITON_CONVERSION_TRITONGPU_TO_LLVM_REDUCE_OP_H