diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
index c01a16de1..45798e628 100644
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@@ -36,6 +36,9 @@ jobs:
       - name: Check style
         run: "autopep8 -a -r -d --exit-code ./python || ( echo '::error title=Style issues::Please run \"autopep8 -a -r -i ./python\"' ; exit 1 )"
 
+      - name: Flake8
+        run: "flake8 --config ./python/setup.cfg ./python || ( echo '::error::Flake8 failed; see logs for errors.' ; exit 1 )"
+
       - name: Unit tests
         run: |
           cd python/test/unit
diff --git a/python/bench/bench_matmul.py b/python/bench/bench_matmul.py
index 9db005da0..b776b3dbf 100644
--- a/python/bench/bench_matmul.py
+++ b/python/bench/bench_matmul.py
@@ -50,7 +50,6 @@ def bench_op(M, N, K, AT, BT, dtype, provider, warmup=25, rep=75):
         a = a.t()
     if BT:
         b = b.t()
-    num_flops = 2 * M * N * K
     tflops = lambda ms: 2. * M * N * K / ms * 1e-9
     if provider == "cublas":
         ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.matmul(a, b), warmup=warmup, rep=rep)
diff --git a/python/setup.cfg b/python/setup.cfg
index 9d24c7de7..9af1cf69c 100644
--- a/python/setup.cfg
+++ b/python/setup.cfg
@@ -3,3 +3,6 @@ description_file = README.md
 
 [pycodestyle]
 ignore = E501,E701,E731
+
+[flake8]
+ignore = E501,E701,E731
diff --git a/python/setup.py b/python/setup.py
index 1cc2ea103..db22c14af 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -149,6 +149,7 @@ setup(
     extras_require={
         "tests": [
             "autopep8",
+            "flake8",
             "isort",
             "numpy",
             "pytest",
diff --git a/python/test/regression/test_performance.py b/python/test/regression/test_performance.py
index 84e829aa8..39299a89a 100644
--- a/python/test/regression/test_performance.py
+++ b/python/test/regression/test_performance.py
@@ -3,7 +3,6 @@ import sys
 
 import pytest
 import torch
-from numpy import record
 
 import triton
 import triton.language as tl
diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py
index 7f0af78b4..d8e88a609 100644
--- a/python/test/unit/language/test_core.py
+++ b/python/test/unit/language/test_core.py
@@ -1,3 +1,4 @@
+# flake8: noqa: F821,F841
 import copy
 import itertools
 import re
diff --git a/python/test/unit/language/test_random.py b/python/test/unit/language/test_random.py
index 82ae7f0c2..042065403 100644
--- a/python/test/unit/language/test_random.py
+++ b/python/test/unit/language/test_random.py
@@ -2,7 +2,6 @@ import numpy as np
 import pytest
 import scipy.stats
 import torch
-from numpy.random import Philox
 
 import triton
 import triton.language as tl
diff --git a/python/triton/__init__.py b/python/triton/__init__.py
index b4a92a8f8..f9982939c 100644
--- a/python/triton/__init__.py
+++ b/python/triton/__init__.py
@@ -1,4 +1,5 @@
 """isort:skip_file"""
+# flake8: noqa: F401
 __version__ = '2.0.0'
 
 # TODO: torch needs to be imported first
diff --git a/python/triton/code_gen.py b/python/triton/code_gen.py
index af95bf280..b7da2047e 100644
--- a/python/triton/code_gen.py
+++ b/python/triton/code_gen.py
@@ -1,19 +1,17 @@
 import ast
 import builtins
-import dbm
 import functools
 import hashlib
 import inspect
 import os
 import pickle
-import struct
 import subprocess
 import sys
 import tempfile
 import textwrap
 import time
 import warnings
-from typing import Dict, Optional
+from typing import Dict
 
 import torch
 from filelock import FileLock
@@ -406,7 +404,7 @@ class CodeGenerator(ast.NodeVisitor):
             self.visit(pos_cond_node),
             self.visit(neg_cond_node),
             _builder=self.builder)
-        #cond_node = neg_cond_node
+        # cond_node = neg_cond_node
         step_node = ast.AugAssign(target=st_target, op=ast.Add(), value=arg_2)
         # code generation
         current_bb = self.builder.get_insert_block()
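A note on the file-wide pragma added to test_core.py earlier in this diff: flake8 parses `@triton.jit` kernels as ordinary Python. The test file splices expressions into kernel source before compilation (its patch_kernel helper rewrites placeholder names), so those placeholders look like undefined names (F821), and values assigned purely to exercise code generation look like unused variables (F841). A minimal sketch of the pattern, with a hypothetical kernel and helper rather than code copied from the test suite:

    # flake8: noqa: F821,F841
    import copy

    import triton
    import triton.language as tl


    @triton.jit
    def template_kernel(X, Z, BLOCK: tl.constexpr):
        off = tl.arange(0, BLOCK)
        x = tl.load(X + off)
        scratch = x * 0          # exists only to exercise codegen -> F841
        z = GENERATE_TEST_HERE   # placeholder substituted below -> F821
        tl.store(Z + off, z)


    def patch_kernel(template, to_replace):
        # Rewrite the kernel's source text before Triton compiles it, so the
        # "undefined" placeholder never reaches the compiler.
        kernel = copy.deepcopy(template)
        for key, value in to_replace.items():
            kernel.src = kernel.src.replace(key, value)
        return kernel

The patched kernel is then launched as usual, e.g. patch_kernel(template_kernel, {'GENERATE_TEST_HERE': 'x + 1'})[(1,)](x, z, BLOCK=128).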
diff --git a/python/triton/language/__init__.py b/python/triton/language/__init__.py
index a7f341f16..0b04465eb 100644
--- a/python/triton/language/__init__.py
+++ b/python/triton/language/__init__.py
@@ -1,3 +1,4 @@
+# flake8: noqa: F401
 from . import core, random
 from .core import *
 from .random import *
diff --git a/python/triton/language/core.py b/python/triton/language/core.py
index 4f63b33bc..d32da45c3 100644
--- a/python/triton/language/core.py
+++ b/python/triton/language/core.py
@@ -802,14 +802,6 @@ def max_contiguous(input, value, _builder=None):
     return frontend.max_contiguous(input, value, _builder)
 
 
-@builtin
-def max_contiguous(input, value, _builder=None):
-    """
-    Let the compiler knows that the `value` first values in :code:`input` are contiguous.
-    """
-    return frontend.max_contiguous(input, value, _builder)
-
-
 # -----------------------
 # Standard library
 # -----------------------
diff --git a/python/triton/ops/__init__.py b/python/triton/ops/__init__.py
index 7d27ffd20..dcaed8ccf 100644
--- a/python/triton/ops/__init__.py
+++ b/python/triton/ops/__init__.py
@@ -1,3 +1,4 @@
+# flake8: noqa: F401
 #from .conv import _conv, conv
 from . import blocksparse
 from .cross_entropy import _cross_entropy, cross_entropy
diff --git a/python/triton/ops/blocksparse/__init__.py b/python/triton/ops/blocksparse/__init__.py
index 231c27a1f..df3353e12 100644
--- a/python/triton/ops/blocksparse/__init__.py
+++ b/python/triton/ops/blocksparse/__init__.py
@@ -1,2 +1,3 @@
+# flake8: noqa: F401
 from .matmul import matmul
 from .softmax import softmax
diff --git a/python/triton/ops/cross_entropy.py b/python/triton/ops/cross_entropy.py
index dfd4f4487..910417d2c 100644
--- a/python/triton/ops/cross_entropy.py
+++ b/python/triton/ops/cross_entropy.py
@@ -1,5 +1,3 @@
-import os
-
 import torch
 
 import triton
@@ -96,11 +94,9 @@ class _cross_entropy(torch.autograd.Function):
         """
         # load saved tensors
         neg_logprobs, indices = ctx.saved_tensors
-        # make kernel
-        device, dtype = neg_logprobs.device, neg_logprobs.dtype
-        n_cols = neg_logprobs.shape[-1]
         # run the kernel
         # neg_logprobs will be modified in place to become our gradient:
+        n_cols = neg_logprobs.shape[-1]
         grid = lambda opt: (neg_logprobs.numel() // n_cols, )
         _backward[grid](neg_logprobs, indices, dneg_logprobs, n_cols)
         return neg_logprobs, None
diff --git a/python/triton/ops/matmul.py b/python/triton/ops/matmul.py
index 60ecc9f3b..d7af57406 100644
--- a/python/triton/ops/matmul.py
+++ b/python/triton/ops/matmul.py
@@ -2,7 +2,7 @@ import torch
 
 import triton
 import triton.language as tl
-from .matmul_perf_model import *
+from .matmul_perf_model import estimate_matmul_time, prune_num_stages
 
 
 def init_to_zero(name):
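The matmul.py hunk above replaces a star import with the two names the module actually uses. Star imports disable flake8's undefined-name analysis: F403 fires at the import and F405 at every use site, and genuinely undefined names can no longer be caught. A hedged sketch with hypothetical module and function names (pkg, perf_model, estimate_time, prune_configs), not Triton's actual API:

    # pkg/matmul.py -- hypothetical module mirroring triton/ops/matmul.py.
    # A star import (`from .perf_model import *`) would make flake8 emit
    # F403 here and F405 at each use below, and would also hide genuinely
    # undefined names from the F821 check.
    from .perf_model import estimate_time, prune_configs


    def pick_best(configs):
        # With explicit imports, flake8 can verify both names exist.
        return min(prune_configs(configs), key=estimate_time)

The `# flake8: noqa: F401` pragmas added to the `__init__.py` files above are the complementary case: those imports are re-exports whose only purpose is to populate the package namespace, so "imported but unused" is a false positive there.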
diff --git a/python/triton/tools/disasm.py b/python/triton/tools/disasm.py
index b030e72ec..3672d4b05 100644
--- a/python/triton/tools/disasm.py
+++ b/python/triton/tools/disasm.py
@@ -20,7 +20,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-import argparse
 import re
 import subprocess
 
@@ -75,7 +74,7 @@ def extract(file_path, fun):
         # function : <function_name>
         # .headerflags: ...
         # /*0000*/ asmstr /*0x...*/
         #                 /*0x...*/
-        fname_match = FNAME_RE.match(line)
+        # Looking for new function header (function: <name>)
         while FNAME_RE.match(line) is None:
             line_idx += 1
diff --git a/python/tutorials/01-vector-add.py b/python/tutorials/01-vector-add.py
index c78ccabbc..ca8b463fe 100644
--- a/python/tutorials/01-vector-add.py
+++ b/python/tutorials/01-vector-add.py
@@ -65,7 +65,7 @@ def add(x: torch.Tensor, y: torch.Tensor):
     #  - each torch.tensor object is implicitly converted into a pointer to its first element.
     #  - `triton.jit`'ed functions can be index with a launch grid to obtain a callable GPU kernel
     #  - don't forget to pass meta-parameters as keywords arguments
-    pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
+    add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
     # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still
     # running asynchronously at this point.
     return output
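Finally, the 01-vector-add.py hunk just stops binding the unused launch handle (pgm, F841); the launch itself is unchanged and, as the tutorial's own comment notes, asynchronous either way. A short usage sketch, assuming the add() wrapper defined in that tutorial and a CUDA device, showing why timing code must synchronize before reading the clock:

    import time

    import torch

    # `add` is the wrapper defined in python/tutorials/01-vector-add.py.
    x = torch.rand(2**20, device='cuda')
    y = torch.rand(2**20, device='cuda')

    start = time.perf_counter()
    z = add(x, y)             # enqueues the kernel and returns immediately
    torch.cuda.synchronize()  # wait for the GPU before stopping the clock
    print(f'add() took {(time.perf_counter() - start) * 1e3:.3f} ms')

    assert torch.allclose(z, x + y)  # safe: synchronize() already ran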