diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp
index cd40f7912..da31ea9d8 100644
--- a/lib/codegen/selection.cpp
+++ b/lib/codegen/selection.cpp
@@ -787,8 +787,8 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> &
       // reduce within warp
       Value *shfl = Intrinsic::getDeclaration(builder.GetInsertBlock()->getModule(), Intrinsic::nvvm_shfl_sync_bfly_f32);
       for (int i = 16; i > 0; i >>= 1){
-        Value *rhs = builder.CreateCall(shfl, {builder.getInt32(0x1f), partial,
-                                               builder.getInt32(i), builder.getInt32(0xffffffff)});
+        Value *rhs = builder.CreateCall(shfl, {builder.getInt32(0xffffffff), partial,
+                                               builder.getInt32(i), builder.getInt32(0x1f)});
         partial = builder.CreateFAdd(partial, rhs);
       }
       // reduce within block
diff --git a/lib/dnn/batchnorm.cpp b/lib/dnn/batchnorm.cpp
index db84bf5b9..4bb29db5d 100644
--- a/lib/dnn/batchnorm.cpp
+++ b/lib/dnn/batchnorm.cpp
@@ -58,7 +58,7 @@ void batchnorm(fp32 *Y, fp32 *M, fp32 *V,
                restrict read_only fp32 *G,
                restrict read_only fp32 *B,
                int32 DHWN) {
-  int32 rx[TM] = get_global_range[TM](0);
+  int32 rx[TM] = 0 ... TM;
   fp32 *px[TM];
   fp32 x[TM];
   int32 c = get_range_id(0);
@@ -82,6 +82,7 @@ void batchnorm(fp32 *Y, fp32 *M, fp32 *V,
     x = *px;
     x = x - mean;
     var = var + x*x;
+    px = px + TM;
   }
   fp32 v = __sum(var);
   fp32 *pv = V + c;