diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index cd40f7912..da31ea9d8 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -787,8 +787,8 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & // reduce within warp Value *shfl = Intrinsic::getDeclaration(builder.GetInsertBlock()->getModule(), Intrinsic::nvvm_shfl_sync_bfly_f32); for (int i = 16; i > 0; i >>= 1){ - Value *rhs = builder.CreateCall(shfl, {builder.getInt32(0x1f), partial, - builder.getInt32(i), builder.getInt32(0xffffffff)}); + Value *rhs = builder.CreateCall(shfl, {builder.getInt32(0xffffffff), partial, + builder.getInt32(i), builder.getInt32(0x1f)}); partial = builder.CreateFAdd(partial, rhs); } // reduce within block diff --git a/lib/dnn/batchnorm.cpp b/lib/dnn/batchnorm.cpp index db84bf5b9..4bb29db5d 100644 --- a/lib/dnn/batchnorm.cpp +++ b/lib/dnn/batchnorm.cpp @@ -58,7 +58,7 @@ void batchnorm(fp32 *Y, fp32 *M, fp32 *V, restrict read_only fp32 *G, restrict read_only fp32 *B, int32 DHWN) { - int32 rx[TM] = get_global_range[TM](0); + int32 rx[TM] = 0 ... TM; fp32 *px[TM]; fp32 x[TM]; int32 c = get_range_id(0); @@ -82,6 +82,7 @@ void batchnorm(fp32 *Y, fp32 *M, fp32 *V, x = *px; x = x - mean; var = var + x*x; + px = px + TM; } fp32 v = __sum(var); fp32 *pv = V + c;