diff --git a/.ci/azure-pipelines.yml b/.ci/azure-pipelines.yml
index 91a825aab..ace29495e 100644
--- a/.ci/azure-pipelines.yml
+++ b/.ci/azure-pipelines.yml
@@ -13,18 +13,11 @@ trigger: [ master ]
 pr:
 - master
 
-# Python version
-strategy:
-  matrix:
-    Python37:
-      python.version: '3.7'
-  maxParallel: 1
-
 # Pipeline
 steps:
 - script: |
     mkdir $(venv)
-    python -m virtualenv $(venv)
+    python -m virtualenv --python=python3 $(venv)
     source $(venv)/bin/activate
     python -m pip install --upgrade pip
     pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio===0.7.2 \
@@ -33,6 +26,15 @@ steps:
     python setup.py install
   displayName: Setup python environment
 
+- script: |
+    source $(venv)/bin/activate
+    pip install matplotlib pandas
+    cd python/bench
+    python -m run
+
+- publish: python/bench/results
+  artifact: Benchmarks
+
 - script: |
     source $(venv)/bin/activate
     pip install pytest
diff --git a/python/bench/run.py b/python/bench/run.py
new file mode 100644
index 000000000..17784947a
--- /dev/null
+++ b/python/bench/run.py
@@ -0,0 +1,41 @@
+import argparse
+import sys
+import os
+import inspect
+import triton
+
+def run_all(result_dir, with_plots, names):
+    if not os.path.exists(result_dir):
+        os.makedirs(result_dir)
+    for mod in os.listdir(os.path.dirname(os.path.realpath(__file__))):
+        # skip non python files
+        if not mod.endswith('.py'):
+            continue
+        # skip file not in provided names
+        if names and names not in mod:
+            continue
+        # skip files that don't start with 'bench_'
+        if not mod.startswith('bench_'):
+            continue
+        print(f'running {mod}...')
+        mod = __import__(os.path.splitext(mod)[0])
+        benchmarks = inspect.getmembers(mod, lambda x: isinstance(x, triton.testing.Mark))
+        for name, bench in benchmarks:
+            curr_dir = os.path.join(result_dir, mod.__name__.replace('bench_', ''))
+            if len(benchmarks) > 1:
+                curr_dir = os.path.join(curr_dir, name.replace('bench_', ''))
+            if not os.path.exists(curr_dir):
+                os.makedirs(curr_dir)
+            bench.run(curr_dir, with_plots)
+
+def main(args):
+    parser = argparse.ArgumentParser(description="Run the benchmark suite.")
+    parser.add_argument("-r", "--result-dir", type=str, default='results', required=False)
+    parser.add_argument("-n", "--names", type=str, default='', required=False)
+    parser.add_argument("-p", "--with-plots", dest='with_plots', action='store_true')
+    parser.set_defaults(feature=False)
+    args = parser.parse_args(args)
+    run_all(args.result_dir, args.with_plots, args.names)
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
\ No newline at end of file
diff --git a/python/setup.py b/python/setup.py
index b85276dd7..a64c08f61 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -92,51 +92,6 @@ class CMakeBuild(build_ext):
         subprocess.check_call(['cmake', sourcedir] + cmake_args, cwd=self.build_temp, env=env)
         subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
 
-class BenchCommand(distutils.cmd.Command):
-
-    description = 'run benchmark suite'
-    user_options = [
-        ('result-dir=', None, 'path to output benchmark results'),\
-        ('with-plots', None, 'plot benchmark results'),\
-        ('filter=' , None, 'filter benchmarks by name')
-    ]
-
-    def initialize_options(self):
-        self.result_dir = 'results'
-        self.filter = ''
-        self.with_plots = False
-
-    def finalize_options(self):
-        if not os.path.exists(self.result_dir):
-            os.makedirs(self.result_dir)
-
-    def run(self):
-        import sys
-        import inspect
-        import triton
-        bench_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'bench')
-        sys.path.append(bench_dir)
-        for mod in os.listdir(bench_dir):
-            # skip non python files
-            if not mod.endswith('.py'):
-                continue
-            # skip file not in provided filter
-            if self.filter and self.filter not in mod:
-                continue
-            # skip files that don't start with 'bench_'
-            if not mod.startswith('bench_'):
-                continue
-            print(f'running {mod}...')
-            mod = __import__(os.path.splitext(mod)[0])
-            benchmarks = inspect.getmembers(mod, lambda x: isinstance(x, triton.testing.Mark))
-            for name, bench in benchmarks:
-                result_dir = os.path.join(self.result_dir, mod.__name__.replace('bench_', ''))
-                if len(benchmarks) > 1:
-                    result_dir = os.path.join(result_dir, name.replace('bench_', ''))
-                if not os.path.exists(result_dir):
-                    os.makedirs(result_dir)
-                bench.run(result_dir, self.with_plots)
-
 setup(
     name='triton',
     version='1.0.0',
@@ -149,7 +104,7 @@ setup(
     package_data={'triton/ops': ['*.c'], 'triton/ops/blocksparse': ['*.c']},
     include_package_data=True,
     ext_modules=[CMakeExtension('triton', 'triton/_C/')],
-    cmdclass={'build_ext': CMakeBuild, 'bench': BenchCommand},
+    cmdclass={'build_ext': CMakeBuild},
     zip_safe=False,
     # for PyPI
     keywords=['Compiler', 'Deep Learning'],
diff --git a/python/test/test_matmul.py b/python/test/test_matmul.py
index 57e45849d..194e3d422 100644
--- a/python/test/test_matmul.py
+++ b/python/test/test_matmul.py
@@ -32,9 +32,9 @@ import torch
     (128, 32, 64, 1, 4, None, None, None, AT, BT, DTYPE),
     (32, 128, 64, 1, 4, None, None, None, AT, BT, DTYPE),
     # 8 warp
-    (128, 256, 16, 1, 8, None, None, None, AT, BT, DTYPE),
-    (256, 128, 16, 1, 8, None, None, None, AT, BT, DTYPE),
-    (256, 128, 32, 1, 8, None, None, None, AT, BT, DTYPE),
+    # (128, 256, 16, 1, 8, None, None, None, AT, BT, DTYPE),
+    # (256, 128, 16, 1, 8, None, None, None, AT, BT, DTYPE),
+    # (256, 128, 32, 1, 8, None, None, None, AT, BT, DTYPE),
     # split-k
     (64, 64, 16, 2, 4, None, None, None, AT, BT, DTYPE),
     (64, 64, 16, 4, 4, None, None, None, AT, BT, DTYPE),