[codegen/selection] Bug fix in code generation for reduction instructions: swap the misplaced 0x1f and 0xffffffff arguments of the warp-level nvvm_shfl_sync_bfly_f32 call
This commit is contained in:
@@ -787,8 +787,8 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> &
|
|||||||
// reduce within warp
|
// reduce within warp
|
||||||
Value *shfl = Intrinsic::getDeclaration(builder.GetInsertBlock()->getModule(), Intrinsic::nvvm_shfl_sync_bfly_f32);
|
Value *shfl = Intrinsic::getDeclaration(builder.GetInsertBlock()->getModule(), Intrinsic::nvvm_shfl_sync_bfly_f32);
|
||||||
for (int i = 16; i > 0; i >>= 1){
|
for (int i = 16; i > 0; i >>= 1){
|
||||||
Value *rhs = builder.CreateCall(shfl, {builder.getInt32(0x1f), partial,
|
Value *rhs = builder.CreateCall(shfl, {builder.getInt32(0xffffffff), partial,
|
||||||
builder.getInt32(i), builder.getInt32(0xffffffff)});
|
builder.getInt32(i), builder.getInt32(0x1f)});
|
||||||
partial = builder.CreateFAdd(partial, rhs);
|
partial = builder.CreateFAdd(partial, rhs);
|
||||||
}
|
}
|
||||||
// reduce within block
|
// reduce within block
|
||||||
|
@@ -58,7 +58,7 @@ void batchnorm(fp32 *Y, fp32 *M, fp32 *V,
|
|||||||
restrict read_only fp32 *G,
|
restrict read_only fp32 *G,
|
||||||
restrict read_only fp32 *B,
|
restrict read_only fp32 *B,
|
||||||
int32 DHWN) {
|
int32 DHWN) {
|
||||||
int32 rx[TM] = get_global_range[TM](0);
|
int32 rx[TM] = 0 ... TM;
|
||||||
fp32 *px[TM];
|
fp32 *px[TM];
|
||||||
fp32 x[TM];
|
fp32 x[TM];
|
||||||
int32 c = get_range_id(0);
|
int32 c = get_range_id(0);
|
||||||
@@ -82,6 +82,7 @@ void batchnorm(fp32 *Y, fp32 *M, fp32 *V,
|
|||||||
x = *px;
|
x = *px;
|
||||||
x = x - mean;
|
x = x - mean;
|
||||||
var = var + x*x;
|
var = var + x*x;
|
||||||
|
px = px + TM;
|
||||||
}
|
}
|
||||||
fp32 v = __sum(var);
|
fp32 v = __sum(var);
|
||||||
fp32 *pv = V + c;
|
fp32 *pv = V + c;
|
||||||
|
Reference in New Issue
Block a user