diff --git a/python/triton/compiler.py b/python/triton/compiler.py
index b49d21b99..67c32cef4 100644
--- a/python/triton/compiler.py
+++ b/python/triton/compiler.py
@@ -6,11 +6,13 @@ import functools
 import hashlib
 import io
 import os
+import shutil
 import subprocess
 import sys
 import sysconfig
 import tempfile
 import warnings
+from sysconfig import get_paths
 from typing import Any, Dict, Set, Tuple, Union
 
 import setuptools
@@ -917,58 +919,6 @@ def generate_name_initializer(signature):
         src
 
 
-@contextlib.contextmanager
-def quiet():
-    old_stdout, old_stderr = sys.stdout, sys.stderr
-    sys.stdout, sys.stderr = io.StringIO(), io.StringIO()
-    try:
-        yield
-    finally:
-        sys.stdout, sys.stderr = old_stdout, old_stderr
-
-
-@functools.lru_cache()
-def libcuda_dir():
-    loc = subprocess.check_output(["whereis", "libcuda.so"]).decode().strip().split()[-1]
-    return os.path.dirname(loc)
-
-
-def _build(name, src, path):
-    # add framework
-    extra_compile_args = []
-    library_dirs = [libcuda_dir()]
-    include_dirs = [path, "/usr/local/cuda/include/"]
-    libraries = ['cuda']
-    # extra arguments
-    extra_link_args = []
-    # create extension module
-    ext = setuptools.Extension(
-        name=name,
-        language='c++',
-        sources=[src],
-        include_dirs=include_dirs,
-        extra_compile_args=extra_compile_args + ['-O3'],
-        extra_link_args=extra_link_args,
-        library_dirs=library_dirs,
-        libraries=libraries,
-    )
-    # build extension module
-    args = ['build_ext']
-    args.append('--build-temp=' + path)
-    args.append('--build-lib=' + path)
-    args.append('-q')
-    args = dict(
-        name=name,
-        ext_modules=[ext],
-        script_args=args,
-    )
-    # with quiet():
-    setuptools.setup(**args)
-    suffix = sysconfig.get_config_var('EXT_SUFFIX')
-    so = os.path.join(path, '{name}{suffix}'.format(name=name, suffix=suffix))
-    return so
-
-
 def binary_name_to_header_name(name):
     if len(name) > 128:
         # avoid filename too long errors (filename limit is 255)
@@ -1030,7 +980,7 @@ unsigned int {name}_shmem = {shmem_size};"""
 #include \"cuda.h\"
 #include <Python.h>
 
-inline void gpuAssert(CUresult code, const char *file, int line)
+static inline void gpuAssert(CUresult code, const char *file, int line)
 {{
    if (code != CUDA_SUCCESS)
    {{
@@ -1048,7 +998,7 @@ inline void gpuAssert(CUresult code, const char *file, int line)
 static CUmodule module = 0;
 static CUfunction function = 0;
 
-static void init_function(const char* name, const unsigned char* src, size_t n_shared_bytes, int64_t device){{
+static inline void init_function(const char* name, const unsigned char* src, size_t n_shared_bytes, int64_t device){{
   CUmodule mod;
   CUfunction fun;
   CUDA_CHECK(cuModuleLoadData(&mod, src));
@@ -1070,7 +1020,7 @@ static void init_function(const char* name, const unsigned char* src, size_t n_s
   function = fun;
 }}
 
-static void init_module(CUdevice device) {{
+static inline void init_module(CUdevice device) {{
   {func_init}
 }}
 
@@ -1209,16 +1159,72 @@ def make_cache_key(fn, signature, configs, constants, num_warps, num_stages):
     key = hashlib.md5(key.encode("utf-8")).hexdigest()
     return key
 
+# utilties for generating and compiling C wrappers
 
-def make_shared_object(fn, constants, signature, num_warps, binaries, tmpdir):
-    src = generate_torch_glue(fn.__name__, constants, signature, num_warps, binaries, tmpdir)
-    src_path = os.path.join(tmpdir, "main.c")
-    with open(src_path, "w") as f:
-        f.write(src)
+
+@functools.lru_cache()
+def libcuda_dir():
+    loc = subprocess.check_output(["whereis", "libcuda.so"]).decode().strip().split()[-1]
+    return os.path.dirname(loc)
+
+
+@contextlib.contextmanager
+def quiet():
+    old_stdout, old_stderr = sys.stdout, sys.stderr
+    sys.stdout, sys.stderr = io.StringIO(), io.StringIO()
+    try:
+        yield
+    finally:
+        sys.stdout, sys.stderr = old_stdout, old_stderr
+
+
+def _build(name, src, srcdir):
+    cuda_lib_dir = libcuda_dir()
+    cu_include_dir = "/usr/local/cuda/include"
+    suffix = sysconfig.get_config_var('EXT_SUFFIX')
+    so = os.path.join(srcdir, '{name}{suffix}'.format(name=name, suffix=suffix))
+    # try to avoid setuptools if possible
+    cc = os.environ.get("CC")
+    if cc is None:
+        # TODO: support more things here.
+        clang = shutil.which("clang")
+        gcc = shutil.which("gcc")
+        cc = gcc if gcc is not None else clang
+    py_include_dir = get_paths()["include"]
+    ret = subprocess.check_call([cc, src, "-O3", f"-I{cu_include_dir}", f"-I{py_include_dir}", f"-I{srcdir}", "-shared", "-fPIC", f"-L{cuda_lib_dir}", "-lcuda", "-o", so])
+    if ret == 0:
+        return so
+    # fallback on setuptools
+    extra_compile_args = []
+    library_dirs = [cuda_lib_dir]
+    include_dirs = [srcdir, cu_include_dir]
+    libraries = ['cuda']
+    # extra arguments
+    extra_link_args = []
+    # create extension module
+    ext = setuptools.Extension(
+        name=name,
+        language='c',
+        sources=[src],
+        include_dirs=include_dirs,
+        extra_compile_args=extra_compile_args + ['-O3'],
+        extra_link_args=extra_link_args,
+        library_dirs=library_dirs,
+        libraries=libraries,
+    )
+    # build extension module
+    args = ['build_ext']
+    args.append('--build-temp=' + srcdir)
+    args.append('--build-lib=' + srcdir)
+    args.append('-q')
+    args = dict(
+        name=name,
+        ext_modules=[ext],
+        script_args=args,
+    )
     with quiet():
-        bin_path = _build(fn.__name__, src_path, tmpdir)
-    with open(bin_path, "rb") as f:
-        return f.read()
+        setuptools.setup(**args)
+    return so
 
 
 def compile(fn, signature: str, device: int = -1, constants=dict(), num_warps: int = 4, num_stages: int = 3, extern_libs=None, configs=None):
@@ -1243,10 +1249,14 @@ def compile(fn, signature: str, device: int = -1, constants=dict(), num_warps: i
     with tempfile.TemporaryDirectory() as tmpdir:
         all_constants = set(constants.keys())
         all_constants.update(configs[0].equal_to_1)
-        so = make_shared_object(fn, all_constants, signature, num_warps, binaries, tmpdir)
+        src = generate_torch_glue(fn.__name__, constants, signature, num_warps, binaries, tmpdir)
+        src_path = os.path.join(tmpdir, "main.c")
+        with open(src_path, "w") as f:
+            f.write(src)
+        so = _build(fn.__name__, src_path, tmpdir)
+        with open(so, "rb") as f:
+            cache_manager.put(f.read())
 
-    # write shared object to cache
-    cache_manager.put(so)
     return CompiledKernel(fn.__name__, cache_manager.bin_path)