diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc
index 1066a5cae..eda55e451 100644
--- a/lib/codegen/analysis/layout.cc
+++ b/lib/codegen/analysis/layout.cc
@@ -380,7 +380,16 @@ void layout::run(ir::module &mod) {
     if(auto *red = dynamic_cast<ir::reduce_inst*>(i)) {
       id++;
       ir::value *arg = red->get_operand(0);
-      layouts_[id] = new layout_shared_t(get(arg), axes_->get(arg), arg->get_type()->get_tile_shapes(), {red}, red->get_type()->get_scalar_ty(), id, align_);
+      unsigned axis = red->get_axis();
+      // shape
+      auto shapes = arg->get_type()->get_tile_shapes();
+      unsigned shape_ax = shapes[axis];
+      const layout_t *layout = get(arg);
+      unsigned per_thread = layout->nts[axis];
+      unsigned depth = shape_ax / per_thread;
+      shapes[axis] = depth;
+      // create layout
+      layouts_[id] = new layout_shared_t(layout, axes_->get(arg), shapes, {red}, red->get_type()->get_scalar_ty(), id, align_);
       tmp_[red] = id;
     }
   });
diff --git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc
index 8fbdbeded..d9d1e1cec 100644
--- a/lib/codegen/selection/generator.cc
+++ b/lib/codegen/selection/generator.cc
@@ -784,18 +784,10 @@ void generator::visit_reduce_inst(ir::reduce_inst* x) {
     partial[pidx] = accumulate(partial[pidx], current);
   });
 
-  // depth
-  unsigned shape_ax = arg->get_type()->get_tile_shapes()[axis];
-  unsigned per_thread = arg_tile->axis(axis).values.size();
-  unsigned depth = shape_ax / per_thread;
-
-  // shapes
-  auto shared_shapes = arg_tile->get_shapes();
-  shared_shapes[axis] = depth;
-
   // reduce within blocks
   machine_layout_t *slayout = machine_layouts_.at(layouts_->get(layouts_->tmp(x)));
   shared_tile *stile = (shared_tile*)slayout->create(x);
+  unsigned depth = stile->get_shapes()[axis];
 
   unsigned addr_space = sh_mem_ptr_->getType()->getPointerAddressSpace();
   Type *res_ty = builder_->getFloatTy();
@@ -832,7 +824,7 @@ void generator::visit_reduce_inst(ir::reduce_inst* x) {
     }
   }
   tgt_->add_barrier(mod_, *builder_);
-
+  // write back
  distributed_tile* x_tile = (distributed_tile*)tmap_.at(x);
  x_tile->for_each([&](indices_t idx) {
    indices_t red_idx = idx;
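
The substance of the codegen change above: the shape of the shared-memory scratch tile for a reduction is now decided once, in the layout pass, instead of being recomputed in the generator. Each thread first folds its `nts[axis]` private values into one partial result, so the scratch tile only needs `shape[axis] / nts[axis]` slots along the reduced axis. A minimal standalone sketch of that arithmetic (the tile shape and `nts` value below are illustrative, not taken from the patch):

    #include <cassert>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<unsigned> shapes = {128, 64};  // distributed tile shape (illustrative)
      unsigned axis = 0;                         // axis being reduced
      unsigned nts = 4;                          // contiguous elements per thread on that axis (assumed)
      assert(shapes[axis] % nts == 0);
      unsigned depth = shapes[axis] / nts;       // partial results that must go through shared memory
      shapes[axis] = depth;                      // shared scratch shape becomes {32, 64}
      std::printf("scratch depth along axis %u: %u\n", axis, depth);
      return 0;
    }

diff --git a/tests/common/reduce.h b/tests/common/reduce.h
new file mode 100644
index 000000000..86e066638
--- /dev/null
+++ b/tests/common/reduce.h
@@ -0,0 +1,148 @@
+#include <vector>
+#include <string>
+#include <memory>
+#include <cassert>
+#include "triton/driver/backend.h"
+#include "triton/driver/stream.h"
+#include "triton/tools/bench.hpp"
+#include "triton/external/half.hpp"
+#include "triton/runtime/function.h"
+#include "src/reduce.h"
+#include "util.h"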
+
+namespace drv = triton::driver;
+namespace rt = triton::runtime;
+
+template<class T>
+void cc_reduce_nd(std::vector<T> &y, const std::vector<T> &x, reduce_op_t op, size_t axis, const std::vector<int>& shapes) {
+  assert(axis <= shapes.size() - 1);
+  // remove shape at index axis to get outer dimensions
+  std::vector<int> outer = shapes;
+  outer.erase(outer.begin() + axis);
+  // retrieve shape at index axis to get inner dimension
+  int inner = shapes[axis];
+  // accumulation function
+  auto acc = get_accumulator<T>(op);
+  // iterate over outer dimensions
+  _loop_nest(outer, [&](const std::vector<int>& y_idx) {
+    T ret = 0;
+    auto x_idx = y_idx;
+    x_idx.insert(x_idx.begin() + axis, 0);
+    // accumulate over inner dimensions
+    for(int z = 0; z < inner; z++){
+      x_idx[axis] = z;
+      ret = acc(ret, x[offset(x_idx, shapes)]);
+    }
+    y[offset(y_idx, outer)] = ret;
+  });
+}
+
+enum run_mode_t {
+  BENCH,
+  TEST
+};
+
+void triton_reduce_nd(drv::stream* stream, const std::vector<int>& shape,
+                      int axis, reduce_op_t op,
+                      const std::vector<int>& x_order, const std::vector<int>& y_order,
+                      std::vector<std::vector<std::string>> TS,
+                      run_mode_t mode, std::vector<double>& bench, bool &test) {
+  typedef float NumericT;
+  std::string ty = "float";
+  size_t dtsize = sizeof(NumericT);
+  drv::context* context = stream->context();
+  size_t axy = (axis == 0) ? 1 : 0;
+
+  // rank
+  size_t rank = shape.size();
+  // size
+  size_t size = 1;
+  for(int32_t d: shape)
+    size *= d;
+  std::vector<std::string> shapename = {"S0", "S1", "S2"};
+  // strides for x
+  std::vector<std::string> x_strides = {"1"};
+  for(size_t d = 0; d < rank - 1; d++)
+    x_strides.push_back(x_strides[d] + " * " + shapename[x_order[d]]);
+  // strides for y
+  std::vector<std::string> y_strides = {"1"};
+  for(size_t d = 0; d < rank - 1; d++)
+    y_strides.push_back(y_strides[d] + " * " + shapename[y_order[d]]);
+
+  // create inputs
+  auto dx = std::unique_ptr<drv::buffer>(drv::buffer::create(context, size*dtsize));
+  auto dy = std::unique_ptr<drv::buffer>(drv::buffer::create(context, size*dtsize));
+  // create options
+  rt::function::options_space_t opt;
+
+  // type
+  opt.defines.push_back({"TYPE", {ty}});
+  // x strides
+  for(size_t d = 0; d < rank; d++)
+    opt.defines.push_back({"STRIDE_XS" + std::to_string(x_order[d]), {x_strides[d]}});
+  // y strides
+  for(size_t d = 0; d < rank; d++)
+    opt.defines.push_back({"STRIDE_YS" + std::to_string(y_order[d]), {y_strides[d]}});
+  if(TS.empty())
+    TS = tile_nd(rank);
+  // tile size
+  for(size_t d = 0; d < rank; d++)
+    opt.defines.push_back({"TS" + std::to_string(d), TS[d]});
+  // non-reduced axis
+  std::string RY = (axis == 0) ? "rn" : "rm";
+  opt.defines.push_back({"TY", {std::to_string(shape[axy])}});
+  opt.defines.push_back({"RY", {RY}});
+  // reduction broadcasting
+  std::string RED = "";
+  for(int n = 0; n < 2; n++){
+    if(n > 0)
+      RED += ", ";
+    RED += (n==axis) ? to_str(op) : ":";
+  }
+  opt.defines.push_back({"RED", {RED}});
+
+  opt.num_warps = {4};
+
+  // kernel
+  rt::function function(src::reduce2d, opt);
+
+  // grid
+  std::vector<rt::arg> args = {&*dx, &*dy};
+  for(int32_t d: shape)
+    args.push_back(d);
+  args.push_back(shape[0]);
+  std::vector<std::string> ts = {"TS0", "TS1", "TS2"};
+  auto grid = grid_nd(shape, ts);
+
+  // metrics
+  if(mode == BENCH){
+    auto gbps = [&](double ns) { return 2 * size * dtsize / (ns * 1e-9) * 1e-9; };
+    double triton_ns = triton::tools::bench([&]() { function(args, grid, stream);}, stream);
+    bench.push_back(gbps(triton_ns));
+  }
+
+  // test triton
+  if(mode == TEST){
+    std::vector<NumericT> hy(shape[axy]);
+    std::vector<NumericT> ry(shape[axy]);
+    std::vector<NumericT> hx(shape[0]*shape[1]);
+    init_zeros(hy);
+    init_rand(hx);
+    stream->write(&*dx, true, 0, hx);
+    function(args, grid, stream);
+    stream->synchronize();
+    stream->read(&*dy, true, 0, hy);
+    cc_reduce_nd(ry, hx, op, axis, shape);
+    test = testing::diff(hy, ry);
+  }
+}
+
+bool do_test(drv::stream* stream, std::vector<int> shape, int axis, reduce_op_t op, int nwarp){
+  std::vector<double> bench;
+  bool test;
+  std::vector<std::vector<std::string>> TSS;
+  for(int32_t d: shape)
+    TSS.push_back({std::to_string(d)});
+  triton_reduce_nd(stream, shape, axis, op, {0, 1}, {0, 1}, TSS, TEST, bench, test);
+  return test;
+}
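
`cc_reduce_nd` above is the CPU reference that the harness compares the kernel against; the only subtle part is how it rebuilds an input index from an output index. Below is a self-contained sketch of the same idea on a concrete 2x3 ADD reduction along axis 1, with a hand-rolled row-major `offset` standing in for the `util.h` helper (whose real implementation is not shown in this patch):

    #include <cstdio>
    #include <vector>

    // Row-major stand-in for the util.h offset() helper (an assumption, not
    // the actual implementation): linearizes idx against shapes.
    static int offset(const std::vector<int>& idx, const std::vector<int>& shapes) {
      int off = 0;
      for(size_t d = 0; d < idx.size(); d++)
        off = off * shapes[d] + idx[d];
      return off;
    }

    int main() {
      // 2x3 input, row-major: {{1, 2, 3}, {4, 5, 6}}, reduced with ADD along axis 1
      std::vector<int> shapes = {2, 3};
      std::vector<float> x = {1, 2, 3, 4, 5, 6};
      size_t axis = 1;
      std::vector<int> outer = {shapes[0]};      // shapes with the reduced axis erased
      std::vector<float> y(outer[0]);
      for(int i = 0; i < outer[0]; i++) {        // 1D instance of the _loop_nest traversal
        float ret = 0;
        std::vector<int> x_idx = {i, 0};         // output index with 0 inserted at `axis`
        for(int z = 0; z < shapes[axis]; z++) {
          x_idx[axis] = z;
          ret += x[offset(x_idx, shapes)];       // ADD accumulator
        }
        y[i] = ret;
      }
      std::printf("y = {%g, %g}\n", y[0], y[1]); // expected: {6, 15}
      return 0;
    }

diff --git a/tests/common/src/reduce.h b/tests/common/src/reduce.h
index 3a77e960e..cc44ca5fc 100644
--- a/tests/common/src/reduce.h
+++ b/tests/common/src/reduce.h
@@ -16,9 +16,9 @@ void reduce2d(TYPE * X __noalias __readonly __aligned(16),
               int M, int N, int ldx) {
   int ridm = get_program_id(0);
   int ridn = get_program_id(1);
-  int rm[TM] = ridm * TM + 0 ... TM;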
-  int rn[TN] = ridn * TN + 0 ... TN;
-  TYPE* px[TM, TN] = X + rm[:, newaxis] + rn[newaxis, :] * ldx;
+  int rm[TS0] = ridm * TS0 + 0 ... TS0;
+  int rn[TS1] = ridn * TS1 + 0 ... TS1;
+  TYPE* px[TS0, TS1] = X + rm[:, newaxis] + rn[newaxis, :] * ldx;
   TYPE* py[TY] = Y + RY;
   *py = (*px)[RED];
 }
diff --git a/tests/unit/reduce.cc b/tests/unit/reduce.cc
index 3c3754133..0ee43cbc5 100644
--- a/tests/unit/reduce.cc
+++ b/tests/unit/reduce.cc
@@ -8,76 +8,10 @@
 #include "triton/tools/bench.hpp"
 #include "triton/external/half.hpp"
 #include "triton/runtime/function.h"
-#include "src/reduce.h"
 #include "cuda/cublas.h"
+#include "reduce.h"
 #include "util.h"
 
-namespace drv = triton::driver;
-namespace rt = triton::runtime;
-
-template<class T>
-void reduce_nd(std::vector<T> &y, const std::vector<T> &x, reduce_op_t op, size_t axis, const std::vector<int>& shapes) {
-  assert(axis <= shapes.size() - 1);
-  // remove shape at index axis to get outer dimensions
-  std::vector<int> outer = shapes;
-  outer.erase(outer.begin() + axis);
-  // retrieve shape at index axis to get inner dimension
-  int inner = shapes[axis];
-  // accumulation function
-  auto acc = get_accumulator<T>(op);
-  // iterate over outer dimensions
-  _loop_nest(outer, [&](const std::vector<int>& y_idx) {
-    T ret = 0;
-    auto x_idx = y_idx;
-    x_idx.insert(x_idx.begin() + axis, 0);
-    // accumulate over inner dimensions
-    for(int z = 0; z < inner; z++){
-      x_idx[axis] = z;
-      ret = acc(ret, x[offset(x_idx, shapes)]);
-    }
-    y[offset(y_idx, outer)] = ret;
-  });
-}
-
-
-bool do_test(drv::stream* stream, std::vector<int> shape, int axis, reduce_op_t op, int nwarp){
-  typedef float NumericT;
-  std::string ty = "float";
-  size_t dt_nbytes = sizeof(NumericT);
-  drv::context* context = stream->context();
-  size_t axy = (axis == 0) ? 1 : 0;
-  std::string RY = (axis == 0) ? "rn" : "rm";
-  std::vector<NumericT> hy(shape[axy]);
-  std::vector<NumericT> ry(shape[axy]);
-  std::vector<NumericT> hx(shape[0]*shape[1]);
-  srand(0);
-  init_zeros(hy);
-  init_rand(hx);
-  auto dy = std::shared_ptr<drv::buffer>(drv::buffer::create(context, hy.size()*dt_nbytes));
-  auto dx = std::shared_ptr<drv::buffer>(drv::buffer::create(context, hx.size()*dt_nbytes));
-  stream->write(&*dy, true, 0, hy);
-  stream->write(&*dx, true, 0, hx);
-  rt::function::options_space_t opt;
-  opt.defines.push_back({"TYPE", {ty}});
-  opt.defines.push_back({"TM", {std::to_string(shape[0])}});
-  opt.defines.push_back({"TN", {std::to_string(shape[1])}});
-  opt.defines.push_back({"TY", {std::to_string(shape[axy])}});
-  opt.defines.push_back({"RY", {RY}});
-  std::string RED = "";
-  for(int n = 0; n < 2; n++){
-    if(n > 0)
-      RED += ", ";
-    RED += (n==axis) ? to_str(op) : ":";
-  }
-  opt.defines.push_back({"RED", {RED}});
-  opt.num_warps = {nwarp};
-  rt::function function(src::reduce2d, opt);
-  function({&*dx, &*dy, shape[0], shape[1], shape[0]}, grid2d(shape[0], shape[1]), stream);
-  stream->synchronize();
-  stream->read(&*dy, true, 0, hy);
-  reduce_nd(ry, hx, op, axis, shape);
-  return testing::diff(hy, ry);
-}
 
 int main() {
   // initialize default compute device
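
For reference, the `RED` define that both the old and the new harness assemble is what drives the kernel's `*py = (*px)[RED]` line: the reduced axis receives the reduction operator and every other axis receives `:` (keep). A standalone sketch of that assembly, assuming `to_str(ADD)` yields `"+"` (`to_str` is a helper from the tests' utilities and is not shown in this patch):

    #include <cstdio>
    #include <string>

    int main() {
      // Build the RED macro the same way the harness does, for both axes.
      for(int axis = 0; axis < 2; axis++) {
        std::string RED = "";
        for(int n = 0; n < 2; n++) {
          if(n > 0)
            RED += ", ";
          RED += (n == axis) ? "+" : ":";  // "+" assumed for to_str(ADD)
        }
        // axis 0 -> "+, :"  => *py = (*px)[+, :]  reduces over rows, one value per column
        // axis 1 -> ":, +"  => *py = (*px)[:, +]  reduces over columns, one value per row
        std::printf("axis %d: RED = \"%s\"\n", axis, RED.c_str());
      }
      return 0;
    }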