diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt
index e648f092c..8ad73fb46 100644
--- a/bench/CMakeLists.txt
+++ b/bench/CMakeLists.txt
@@ -1,12 +1,15 @@
+set(CMAKE_BUILD_TYPE Release)
+
+find_package(CUDA)
+
 foreach(PROG blas)
   add_executable(${PROG}-bench ${PROG}.cpp)
   target_link_libraries(${PROG}-bench ${OPENCL_LIBRARIES})
   set_target_properties(${PROG}-bench PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL -Wall -Wextra")
-endforeach(PROG)
-
-# packages
-find_package(CUDA)
-if(CUDA_FOUND)
-  cuda_add_executable(cublas-bench "cublas.cu")
-  target_link_libraries(cublas-bench "cublas")
-endif()
+  if(CUDA_FOUND)
+    set(CUPROG cu${PROG})
+    cuda_add_executable(${CUPROG}-bench ${CUPROG}.cu OPTIONS "-DVIENNACL_WITH_OPENCL")
+    target_link_libraries(${CUPROG}-bench ${CUPROG} OpenCL)
+  endif()
+
+endforeach(PROG)
diff --git a/bench/blas.cpp b/bench/blas.cpp
index 5e380c579..687525149 100644
--- a/bench/blas.cpp
+++ b/bench/blas.cpp
@@ -1,26 +1,30 @@
+//#define VIENNACL_DEBUG_ALL
+
 #include "viennacl/matrix.hpp"
 #include "viennacl/vector.hpp"
 #include "viennacl/tools/timer.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/scheduler/execute.hpp"
 
 #include "atidlas/tools/misc.hpp"
 #include "atidlas/model/import.hpp"
 #include "atidlas/model/model.hpp"
 
+#include "common.hpp"
+
 #include <cmath>
 #include <iostream>
 
 namespace ad = atidlas;
 typedef atidlas::atidlas_int_t int_t;
 
 template<class T>
-float bandwidth(std::size_t N, float t)
+void bench(std::map<std::string, ad::tools::shared_ptr<ad::model> > & models)
 {
-  return N * sizeof(T) * 1e-9 / t;
-}
+  typedef viennacl::matrix<T> matrix_type;
+  typedef viennacl::vector<T> vector_type;
 
-template<class T>
-void bench(std::vector<int_t> BLAS1_N, std::map<std::string, ad::tools::shared_ptr<ad::model> > & models)
-{
   viennacl::tools::timer timer;
   float total_time = 0;
   std::vector<float> times;
@@ -30,29 +34,61 @@ void bench(std::vector<int_t> BLAS1_N, std::map<std::string, ad::tools::shared_ptr<ad::model> > & models)
 #define BENCH(declarations, statement_op, sizes, measure, N, key)\
-  std::cout << "#" << key << std::endl;\
-  for(std::vector<int_t>::const_iterator it = sizes.begin() ; it != sizes.end() ; ++it)\
-  {\
-    declarations;\
-    viennacl::scheduler::statement statement(statement_op);\
-    BENCHMARK(y = x + y, time_viennacl);\
-    BENCHMARK(models[key]->execute(statement), time_model);\
-    BENCHMARK(models[key]->execute(statement, true), time_unique_kernel);\
-    models[key]->tune(statement);\
-    BENCHMARK(models[key]->execute(statement), time_opt);\
-    std::cout << *it << " " << measure(N, time_viennacl) << " " << measure(N, time_unique_kernel) << " " << measure(N, time_model) << " " << measure(N, time_opt) << std::endl;\
-  }\
+  if(models.find(key)!=models.end())\
+  {\
+    if(!first)\
+    {\
+      std::cout << std::endl;\
+      std::cout << std::endl;\
+    }\
+    std::cout << "#" << key << std::endl;\
+    for(std::vector<int_t>::const_iterator it = sizes.begin() ; it != sizes.end() ; ++it)\
+    {\
+      declarations;\
+      viennacl::scheduler::statement statement(statement_op);\
+      BENCHMARK(models.at(key)->execute(statement), time_model);\
+      BENCHMARK(models[key]->execute(statement, true), time_unique_kernel);\
+      models[key]->tune(statement);\
+      BENCHMARK(models[key]->execute(statement), time_opt);\
+      std::cout << *it << " " << measure(N, time_unique_kernel) << " " << measure(N, time_model) << " " << measure(N, time_opt) << std::endl;\
+    }\
+  }\
 
 #define DECLARE(type, ...) type __VA_ARGS__
 #define ARGS(...) __VA_ARGS__
 
-  BENCH(DECLARE(viennacl::vector<T>, x(*it), y(*it)), ARGS(y, viennacl::op_assign(), x + y), BLAS1_N, bandwidth<T>, 3*(*it), "vector-axpy-float32");
-  std::cout << std::endl;
-  std::cout << std::endl;
-}
+  /*---------*/
+  /*--BLAS1--*/
+  /*---------*/
 
-std::vector<int_t> create_log_range(int_t min, int_t max, int_t N)
-{
-  std::vector<int_t> res(N);
-  for(int_t i = 0 ; i < N ; ++i)
-    res[i] = std::exp(std::log(min) + (float)(std::log(max) - std::log(min))*i/N);
-  return res;
+  //AXPY
+  bool first = true;
+  BENCH(DECLARE(viennacl::vector<T>, x(*it), y(*it)), ARGS(y, viennacl::op_assign(), x + y),
+        BLAS1_N, bandwidth<T>, 3*(*it), "vector-axpy-float32");
+  first = false;
+
+
+  //DOT
+  BENCH(DECLARE(viennacl::scalar<T> s(0)); DECLARE(vector_type, x(*it), y(*it)), ARGS(s, viennacl::op_assign(), viennacl::linalg::inner_prod(x,y)),
+        BLAS1_N, bandwidth<T>, 2*(*it), "reduction-float32");
+
+  /*---------*/
+  /*--BLAS2--*/
+  /*---------*/
+
+  //N-layout
+  for(std::vector<int>::const_iterator Mit = BLAS2_M.begin() ; Mit != BLAS2_M.end() ; ++Mit)
+  {
+    BENCH(DECLARE(matrix_type, A(*Mit,*it)); DECLARE(vector_type, y(*Mit), x(*it)), ARGS(y, viennacl::op_assign(), viennacl::linalg::prod(A,x)),
+          BLAS2_N, bandwidth<T>, (*Mit)*(*it), "row-wise-reductionN-float32");
+  }
+
+  //T-layout
+  for(std::vector<int>::const_iterator Mit = BLAS2_M.begin() ; Mit != BLAS2_M.end() ; ++Mit)
+  {
+    BENCH(DECLARE(matrix_type, A(*it,*Mit)); DECLARE(vector_type, y(*Mit), x(*it)), ARGS(y, viennacl::op_assign(), viennacl::linalg::prod(viennacl::trans(A),x)),
+          BLAS2_N, bandwidth<T>, (*Mit)*(*it), "row-wise-reductionT-float32");
+  }
+
+  /*---------*/
+  /*--BLAS3--*/
+  /*---------*/
 }
@@ -81,9 +117,7 @@ int main(int argc, char* argv[])
   }
 
   std::map<std::string, ad::tools::shared_ptr<ad::model> > models = ad::import(argv[1]);
-  std::vector<int_t> BLAS1_N = create_log_range(1e3, 2e7, 50);
-
   std::cout << "#Benchmark : BLAS" << std::endl;
   std::cout << "#----------------" << std::endl;
-  bench<float>(BLAS1_N, models);
+  bench<float>(models);
 }
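A note on the macro plumbing in blas.cpp, since it is easy to misread: DECLARE and ARGS exist only to smuggle comma-separated token lists through a single macro argument, so that BENCH can splice variable declarations and a statement constructor into its loop body. A minimal, self-contained illustration of that pattern (hypothetical names, not code from this patch):

#include <iostream>

#define DECLARE(type, ...) type __VA_ARGS__
#define ARGS(...) __VA_ARGS__

// RUN mirrors the shape of BENCH above: one argument carries the
// declarations, the other carries the comma-separated call arguments.
#define RUN(declarations, call_args) \
  { \
    declarations; \
    print(call_args); \
  }

void print(int a, int b) { std::cout << a + b << std::endl; }

int main()
{
  // Expands to { int x(1), y(2); print(x, y); } and prints 3.
  RUN(DECLARE(int, x(1), y(2)), ARGS(x, y));
}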
diff --git a/bench/common.hpp b/bench/common.hpp
new file mode 100644
index 000000000..3971f1cfb
--- /dev/null
+++ b/bench/common.hpp
@@ -0,0 +1,62 @@
+#ifndef ATIDLAS_BENCH_COMMON_HPP_
+#define ATIDLAS_BENCH_COMMON_HPP_
+
+#include <vector>
+#include <cmath>
+#include <algorithm>
+#include <cstddef>
+
+#include "atidlas/tools/misc.hpp"
+
+int ceil(int N, int pad)
+{
+  return (N%pad==0)?N:(N+pad-1)/pad*pad;
+}
+
+std::vector<int> create_log_range(int min, int max, int N, int pad)
+{
+  std::vector<int> res(N);
+  for(int i = 0 ; i < N ; ++i)
+  {
+    res[i] = std::exp(std::log(min) + (float)(std::log(max) - std::log(min))*i/N);
+    res[i] = ceil(res[i], pad);
+  }
+  return res;
+}
+
+std::vector<int> create_full_range(int min, int max, int pad)
+{
+  std::vector<int> N;
+  for(int i = ceil(min, pad) ; i < ceil(max, pad) ; i+=pad)
+    N.push_back(i);
+  return N;
+}
+
+// BLAS1 Sizes
+static const std::vector<int> BLAS1_N = create_log_range(1e3, 2e7, 50, 64);
+
+// BLAS2 Sizes
+static const std::vector<int> BLAS2_M = atidlas::tools::make_vector<int>() << 256;
+static const std::vector<int> BLAS2_N = create_full_range(128, 5000, 64);
+
+// BLAS3 Sizes
+static const std::vector<int> BLAS3_N = create_full_range(128, 5000, 64);
+
+template<class T>
+float bandwidth(std::size_t N, float t)
+{
+  return N * sizeof(T) * 1e-9 / t;
+}
+
+template<class T>
+T median(std::vector<T> x)
+{
+  size_t size = x.size();
+  std::sort(x.begin(), x.end());
+  if (size % 2 == 0)
+    return (x[size / 2 - 1] + x[size / 2]) / 2;
+  else
+    return x[size / 2];
+}
+
+#endif
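To sanity-check the helpers in common.hpp, here is a hypothetical standalone driver (it assumes the atidlas headers that common.hpp pulls in are on the include path; none of this is part of the patch):

#include <iostream>
#include "common.hpp"

int main()
{
  // 50 logarithmically spaced sizes in [1e3, 2e7], each rounded up to a multiple of 64.
  std::vector<int> logN = create_log_range(1e3, 2e7, 50, 64);
  // Every multiple of 64 in [128, 5000).
  std::vector<int> fullN = create_full_range(128, 5000, 64);
  std::cout << logN.front() << " ... " << logN.back()
            << " (" << fullN.size() << " dense sizes)" << std::endl;

  // median() + bandwidth<T>() turn raw timings into a GB/s figure:
  // an AXPY of size N moves 3N floats (read x, read y, write y).
  std::vector<float> timings;
  timings.push_back(1.2e-3f);
  timings.push_back(0.9e-3f);
  timings.push_back(1.0e-3f);
  int N = 1000000;
  std::cout << bandwidth<float>(3*N, median(timings)) << " GB/s" << std::endl;
}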
diff --git a/bench/cublas.cu b/bench/cublas.cu
index 09ffec4b2..ef0a8f7f3 100644
--- a/bench/cublas.cu
+++ b/bench/cublas.cu
@@ -8,8 +8,11 @@
 
 #include "common.hpp"
 
+#include "CL/cl.h"
+#include "atidlas/execute.hpp"
+
 template<class NumericT>
-void run(std::vector<int> const & BLAS1_N, std::vector<int> const & BLAS2_N, std::vector<int> const & BLAS3_N)
+void run()
 {
 #define FILL_TIMINGS(OP, timings) \
 {\
@@ -74,25 +77,28 @@ void run()
   std::cout << "#GEMV" << std::endl;
   std::cout << "#N Perf" << std::endl;
 
-  for(std::vector<int>::const_iterator it = BLAS2_N.begin() ; it != BLAS2_N.end() ; ++it)
+  for(std::vector<int>::const_iterator Mit = BLAS2_M.begin() ; Mit != BLAS2_M.end() ; ++Mit)
   {
-    int M = *it;
-    int N = *it;
-    NumericT *x, *y, *A;
-    cudaMalloc((void**) &A, M * N * sizeof(NumericT));
-    cudaMalloc((void**) &x, M * sizeof(NumericT));
-    cudaMalloc((void**) &y, N * sizeof(NumericT));
-    //Bench
-    std::vector<float> timings;
-    FILL_TIMINGS(cublasSgemv('T', M, N, 1.0, A, M, x, 1, 1.0, y, 1), timings);
-    std::cout << N << " " << (M + N + M*N)*sizeof(NumericT)*1e-9/median(timings) << std::endl;
-    //Free
-    cudaFree(A);
-    cudaFree(x);
-    cudaFree(y);
+    for(std::vector<int>::const_iterator it = BLAS2_N.begin() ; it != BLAS2_N.end() ; ++it)
+    {
+      int M = *Mit;
+      int N = *it;
+      NumericT *x, *y, *A;
+      cudaMalloc((void**) &A, M * N * sizeof(NumericT));
+      cudaMalloc((void**) &x, N * sizeof(NumericT));
+      cudaMalloc((void**) &y, M * sizeof(NumericT));
+      //Bench
+      std::vector<float> timings;
+      FILL_TIMINGS(cublasSgemv('N', M, N, 1.0, A, M, x, 1, 1.0, y, 1), timings);
+      std::cout << N << " " << (M + N + M*N)*sizeof(NumericT)*1e-9/median(timings) << std::endl;
+      //Free
+      cudaFree(A);
+      cudaFree(x);
+      cudaFree(y);
+    }
+    std::cout << std::endl;
+    std::cout << std::endl;
   }
-  std::cout << std::endl;
-  std::cout << std::endl;
 
   std::cout << "#GEMM" << std::endl;
   std::cout << "#N Perf" << std::endl;
@@ -124,10 +130,5 @@ int main(int argc, char** argv)
   cudaDeviceProp prop;
   cudaGetDeviceProperties(&prop, 0);
   std::cout << "Device: " << prop.name << std::endl;
-
-  std::vector<int> BLAS1_N = create_log_range(1e3, 2e7, 50);
-  std::vector<int> BLAS2_N = create_log_range(100, 4000, 50);
-  std::vector<int> BLAS3_N = create_log_range(100, 4000, 50);
-
-  run<float>(BLAS1_N, BLAS2_N, BLAS3_N);
+  run<float>();
 }
diff --git a/bench/plot.gnuplot b/bench/plot.gnuplot
index eac214b5f..c0b59ddb0 100644
--- a/bench/plot.gnuplot
+++ b/bench/plot.gnuplot
@@ -1,7 +1,14 @@
-set logscale x
-
 set terminal pdf
-set output 'saxpy.pdf'
-plot "out.dat" i 0 using 1:2 with lines title 'Naive', \
-     "out.dat" i 0 using 1:3 with lines title 'Model', \
-     "out.dat" i 0 using 1:4 with lines title 'Optimal'
+set output 'bench.pdf'
+
+set xlabel 'N'
+set ylabel 'Bandwidth (GB/s)'
+set key top left
+stats "out.dat" nooutput
+
+set logscale x
+do for [i=1:STATS_blocks]{
+  plot "out.dat" index (i-1) using 1:2 with lines title 'Naive', \
+       "out.dat" index (i-1) using 1:3 with lines title 'Model', \
+       "out.dat" index (i-1) using 1:4 with lines title 'Optimal'
+}
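The body of FILL_TIMINGS is elided by the context lines above; for orientation, one plausible shape for such a harness is to synchronize, time each call, repeat, and let median() discard outliers. A sketch under those assumptions (hypothetical helper, not the macro's actual definition):

#include <vector>
#include <chrono>
#include <cuda_runtime.h>

// Run op() nruns times, synchronizing around each call, and collect
// per-call wall-clock durations (in seconds) into timings.
template<class OP>
void fill_timings(OP op, std::vector<float> & timings, int nruns = 10)
{
  for(int r = 0 ; r < nruns ; ++r)
  {
    cudaDeviceSynchronize();  // drain any pending work
    std::chrono::high_resolution_clock::time_point t0 = std::chrono::high_resolution_clock::now();
    op();                     // e.g. [&]{ cublasSgemv('N', M, N, 1.0, A, M, x, 1, 1.0, y, 1); }
    cudaDeviceSynchronize();  // wait for the asynchronous call to finish
    std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
    timings.push_back(std::chrono::duration<float>(t1 - t0).count());
  }
}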