diff --git a/examples/python/tensorflow/CMakeLists.txt b/examples/python/tensorflow/CMakeLists.txt
index f9b650d1d..c531c23b1 100644
--- a/examples/python/tensorflow/CMakeLists.txt
+++ b/examples/python/tensorflow/CMakeLists.txt
@@ -5,7 +5,7 @@ if(${TensorFlow_FOUND})
     include_directories("${CUDA_HOME}/include")
     link_directories(${TF_LIB})
     add_definitions(-D_GLIBCXX_USE_CXX11_ABI=${TF_ABI})
-    add_library(tf_blocksparse SHARED dot.cpp)
+    add_library(tf_blocksparse SHARED dot.cpp dense_conv.cpp)
     target_link_libraries(tf_blocksparse tensorflow_framework triton)
     configure_file(${CMAKE_CURRENT_SOURCE_DIR}/run.py
                    ${CMAKE_CURRENT_BINARY_DIR}/run.py
diff --git a/examples/python/tensorflow/dense_conv.cpp b/examples/python/tensorflow/dense_conv.cpp
new file mode 100644
index 000000000..66e7bfdab
--- /dev/null
+++ b/examples/python/tensorflow/dense_conv.cpp
@@ -0,0 +1,119 @@
+#include <cassert>
+#include <sstream>
+
+#include "triton/driver/buffer.h"
+#include "triton/driver/backend.h"
+#include "triton/driver/stream.h"
+#include "triton/runtime/jit.h"
+#include "triton/tools/bench.hpp"
+#include "triton/dnn/gemm.h"
+#include "triton/dnn/conv.h"
+
+#define EIGEN_USE_GPU
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/tensor_format.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
+
+using namespace tensorflow;
+using GPUDevice = Eigen::GpuDevice;
+
+//torch::Tensor conv_common(
+//    int32_t B, int32_t C, int32_t D, int32_t H, int32_t W,
+//    int32_t T, int32_t R, int32_t S, int32_t NF,
+//    int32_t stride_d, int32_t stride_h, int32_t stride_w,
+//    int32_t pad_d, int32_t pad_h, int32_t pad_w,
+//    triton::dnn::conv::type ty,
+//    torch::Tensor torcha, torch::Tensor torchb, torch::Tensor torchbias,
+//    bool autotune = false
+//    ) {
+//}
+
+class DenseConvOp : public OpKernel {
+ public:
+  explicit DenseConvOp(OpKernelConstruction* context) : OpKernel(context) {
+  }
+
+  void Compute(OpKernelContext* context){
+    // get device/stream
+    GPUDevice device = context->eigen_device<GPUDevice>();
+    triton::driver::cu_stream sstream(device.stream(), false);
+    triton::driver::context* ctx = sstream.context();
+    triton::driver::stream* stream = &sstream;
+    // get inputs
+    const Tensor& tfa = context->input(0);
+    const Tensor& tfb = context->input(1);
+    // get shapes (a: NCHW image, b: C x R x S x NF filter)
+    int32_t B  = tfa.dim_size(0);
+    int32_t Ca = tfa.dim_size(1);
+    int32_t D  = 1;
+    int32_t H  = tfa.dim_size(2);
+    int32_t W  = tfa.dim_size(3);
+    int32_t Cb = tfb.dim_size(0);
+    int32_t T  = 1;
+    int32_t R  = tfb.dim_size(1);
+    int32_t S  = tfb.dim_size(2);
+    int32_t NF = tfb.dim_size(3);
+    assert(Ca == Cb);
+    int32_t C = Ca;
+    int32_t stride_d = 1, stride_h = 1, stride_w = 1;
+    int32_t pad_d = 0, pad_h = 0, pad_w = 0;
+    bool has_bias = false;
+
+    // get conv configuration
+    triton::dnn::conv configuration(B, C, D, H, W, T, R, S, NF,
+                                    stride_d, stride_h, stride_w,
+                                    pad_d, pad_h, pad_w,
+                                    1, 1, 1,
+                                    triton::dnn::conv::FPROP, has_bias);
+
+    // bind memory
+    triton::driver::cu_buffer a(ctx, (CUdeviceptr)tfa.flat<float>().data(), false);
+    triton::driver::cu_buffer b(ctx, (CUdeviceptr)tfb.flat<float>().data(), false);
+//    triton::driver::cu_buffer cubias(ctx, (CUdeviceptr)torchbias.storage().data(), false);
+//    triton::driver::buffer* bias = has_bias ? &cubias : nullptr;
+    triton::driver::buffer* bias = nullptr;
+
+    // allocate output
+    auto c_shapes = configuration.c_shapes();
+    Tensor* tfc = nullptr;
+    TensorShape out_shape({c_shapes[0], c_shapes[1], c_shapes[2], c_shapes[3]});
+    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &tfc));
+    triton::driver::cu_buffer c(ctx, (CUdeviceptr)tfc->flat<float>().data(), false);
+
+    // benchmark a given convolution kernel
+    triton::jit jit(ctx);
+    auto benchmark = [&](triton::driver::kernel* kernel,
+                         triton::jit::launch_information info) {
+      configuration.init(stream, (triton::driver::cu_module*)kernel->module());
+      unsigned TM = info.global_range_size[0];
+      unsigned TN = info.global_range_size[1];
+      unsigned nthreads = info.num_threads;
+      unsigned GZ = jit.get_int("GZ");
+      configuration.enqueue(stream, kernel, &a, &b, &c, bias, TM, TN, GZ, nthreads);
+      stream->synchronize();
+      double ts = triton::tools::bench([&](){ configuration.enqueue(stream, kernel, &a, &b, &c, bias, TM, TN, GZ, nthreads); },
+                                       [&](){ stream->synchronize(); }, stream->context()->device());
+      return configuration.get_nflops() / ts * 1e-3;
+    };
+
+    // generate Triton-C source code for this configuration
+    std::ostringstream oss;
+    configuration.src(oss);
+    std::string src = oss.str();
+
+    // autotune over the generated source and cache the best kernel
+    triton::jit::tune_res_t best = jit.autotune("conv", src.c_str(), benchmark);
+    jit.add_module("conv", src.c_str(), best.params);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("DenseConv").Device(DEVICE_GPU), DenseConvOp);
+REGISTER_OP("DenseConv")
+    .Input("a: float32")
+    .Input("b: float32")
+    .Output("c: float32")
+;
diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp
index 8ff9dc854..bdaab5921 100644
--- a/examples/python/tensorflow/dot.cpp
+++ b/examples/python/tensorflow/dot.cpp
@@ -19,9 +19,9 @@
 using namespace tensorflow;
 using GPUDevice = Eigen::GpuDevice;
 
-class BlockSparseGemmOp : public OpKernel {
+class DotOp : public OpKernel {
  public:
-  explicit BlockSparseGemmOp(OpKernelConstruction* context) : OpKernel(context) {
+  explicit DotOp(OpKernelConstruction* context) : OpKernel(context) {
   }
 
   void Compute(OpKernelContext* context){
@@ -52,7 +52,6 @@ class BlockSparseGemmOp : public OpKernel {
     triton::driver::cu_buffer db(ctx, (CUdeviceptr)b.flat<Eigen::half>().data(), false);
     triton::driver::cu_buffer dc(ctx, (CUdeviceptr)c->flat<Eigen::half>().data(), false);
     triton::driver::cu_buffer dlocks(ctx, (CUdeviceptr)locks.flat<int32_t>().data(), false);
-    stream->synchronize();
     // benchmark a given matrix multiplication kernel
     auto benchmark = [&](triton::driver::kernel* kernel,
                          triton::jit::launch_information info) {
@@ -85,7 +84,7 @@ class BlockSparseGemmOp : public OpKernel {
 private:
 };
 
-REGISTER_KERNEL_BUILDER(Name("Dot").Device(DEVICE_GPU), BlockSparseGemmOp);
+REGISTER_KERNEL_BUILDER(Name("Dot").Device(DEVICE_GPU), DotOp);
 REGISTER_OP("Dot")
   .Input("a: float16")
   .Input("b: float16")
diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py
index 0788231e0..9756ee340 100644
--- a/examples/python/tensorflow/run.py
+++ b/examples/python/tensorflow/run.py
@@ -6,24 +6,40 @@ data_files_path = tf.resource_loader.get_data_files_path()
 library_dir = os.path.dirname(os.path.realpath(__file__))
 module = tf.load_op_library(os.path.join(library_dir, 'libtf_blocksparse.so'))
 
-M, N, K = 128,128,128
-a = tf.placeholder(tf.float16, shape=[M, K])
-b = tf.placeholder(tf.float16, shape=[N, K])
-locks = tf.placeholder(tf.int32, shape=[4096])
-# c = tf.matmul(a, b, transpose_a=True)
-c = module.dot(a, b, locks)
+def run_dot():
+    M, N, K = 128, 128, 128
+    a = tf.placeholder(tf.float16, shape=[M, K])
+    b = tf.placeholder(tf.float16, shape=[N, K])
+    locks = tf.placeholder(tf.int32, shape=[4096])
+    # c = tf.matmul(a, b, transpose_a=True)
+    c = module.dot(a, b, locks)
+    # Reference
+    ha = np.random.rand(M, K).astype(np.float16)
+    hb = np.random.rand(N, K).astype(np.float16)
+    # Run
+    sess = tf.InteractiveSession()
+    sess.run(tf.global_variables_initializer())
+    result = sess.run([c], feed_dict = {locks: np.zeros(4096),
+                                        a: ha,
+                                        b: hb})[0]
+    # Test
+    hresult = np.dot(ha.T, hb).T
+    dif = np.abs(result - hresult)
+    print("dif: %f" % np.max(dif))
 
-# Reference
-ha = np.random.rand(M, K).astype(np.float16)
-hb = np.random.rand(N, K).astype(np.float16)
+def run_conv():
+    BS, C, H, W = 16, 32, 32, 32
+    R, S, NF = 3, 3, 32
+    a = tf.placeholder(tf.float32, shape=[BS, C, H, W])
+    b = tf.placeholder(tf.float32, shape=[C, R, S, NF])
+    c = module.dense_conv(a, b)
+    # Reference
+    ha = np.random.rand(BS, C, H, W).astype(np.float32)
+    hb = np.random.rand(C, R, S, NF).astype(np.float32)
+    # Run
+    sess = tf.InteractiveSession()
+    sess.run(tf.global_variables_initializer())
+    result = sess.run([c], feed_dict = {a: ha,
+                                        b: hb})[0]
 
-# Run
-sess = tf.InteractiveSession()
-sess.run(tf.global_variables_initializer())
-result = sess.run([c], feed_dict = {locks: np.zeros(4096),
-                                    a: ha,
-                                    b: hb})[0]
-
-hresult = np.dot(ha.T, hb).T
-dif = np.abs(result - hresult)
-print("dif: %f" % np.max(dif))
+run_conv()
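
Note that run_conv() stops at running the op, whereas run_dot() also checks its result against NumPy. A minimal host-side reference that such a check could use is sketched below. It assumes the op performs a cross-correlation with the stride-1, zero-padding settings hard-coded in dense_conv.cpp, and that the output allocated from triton::dnn::conv::c_shapes() is laid out as NCHW; reference_conv is a hypothetical helper, not part of this patch, and both layout assumptions would need to be verified.

import numpy as np

# Hypothetical reference for module.dense_conv: stride-1, no-padding
# cross-correlation of an NCHW image with a (C, R, S, NF) filter,
# assumed to produce an NCHW output of size (BS, NF, H-R+1, W-S+1).
def reference_conv(ha, hb):
    BS, C, H, W = ha.shape
    _, R, S, NF = hb.shape
    P, Q = H - R + 1, W - S + 1                    # output height/width
    hc = np.zeros((BS, NF, P, Q), dtype=ha.dtype)
    for r in range(R):
        for s in range(S):
            patch = ha[:, :, r:r + P, s:s + Q]     # (BS, C, P, Q) window
            hc += np.einsum('bchw,cn->bnhw', patch, hb[:, r, s, :])
    return hc

# usage, at the end of run_conv(), mirroring run_dot():
#   hresult = reference_conv(ha, hb)
#   dif = np.abs(result - hresult)
#   print("dif: %f" % np.max(dif))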