diff --git a/README.md b/README.md
index ed0fc71b1..bab417daa 100644
--- a/README.md
+++ b/README.md
@@ -33,6 +33,15 @@ And the latest nightly release:
 pip install -U --pre triton
 ```
 
+# Install from source
+
+```
+git clone https://github.com/openai/triton.git;
+cd triton/python;
+pip install cmake; # build time dependency
+pip install -e .
+```
+
 # Changelog
 
 Version 1.1 is out! New features include:
diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py
index cad10c871..7e6071926 100644
--- a/python/test/unit/language/test_core.py
+++ b/python/test/unit/language/test_core.py
@@ -1593,9 +1593,9 @@ def test_num_warps_pow2():
                          [('int32', 'libdevice.ffs', ''),
                           ('float32', 'libdevice.pow', '/usr/local/cuda/nvvm/libdevice/libdevice.10.bc'),
                           ('float64', 'libdevice.norm4d', '')])
-def test_libdevice(dtype_str, expr, lib_path):
+def test_libdevice_tensor(dtype_str, expr, lib_path):
     if torch.version.hip is not None:
-        pytest.skip(f"test_libdevice currently has segfaults on ROCM")
+        pytest.skip(f"test_libdevice_tensor currently has segfaults on ROCM")
     @triton.jit
     def kernel(X, Y, BLOCK: tl.constexpr):
         x = tl.load(X + tl.arange(0, BLOCK))
@@ -1630,3 +1630,32 @@ def test_libdevice(dtype_str, expr, lib_path):
         np.testing.assert_equal(y_ref, to_numpy(y_tri))
     else:
         np.testing.assert_allclose(y_ref, to_numpy(y_tri), rtol=0.01)
+
+
+@pytest.mark.parametrize("dtype_str, expr, lib_path",
+                         [('float32', 'libdevice.pow', '')])
+def test_libdevice_scalar(dtype_str, expr, lib_path):
+
+    @triton.jit
+    def kernel(X, Y, BLOCK: tl.constexpr):
+        x = X
+        y = GENERATE_TEST_HERE
+        tl.store(Y + tl.arange(0, BLOCK), y)
+
+    shape = (128, )
+    rs = RandomState(17)
+    # generate a single random scalar input
+    x = numpy_random((1,), dtype_str=dtype_str, rs=rs)
+    y_ref = np.zeros(shape, dtype=x.dtype)
+
+    # numpy does not allow negative factors in power, so we use abs()
+    x = np.abs(x)
+    kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': 'tl.libdevice.pow(x, x)'})
+    y_ref[:] = np.power(x, x)
+
+    # triton result
+    x_tri = to_triton(x)[0].item()
+    y_tri = to_triton(numpy_random((shape[0],), dtype_str=dtype_str, rs=rs), device='cuda')
+    kernel[(1,)](x_tri, y_tri, BLOCK=shape[0], extern_libs={'libdevice': lib_path})
+    # compare
+    np.testing.assert_allclose(y_ref, to_numpy(y_tri), rtol=0.01)
diff --git a/python/triton/compiler.py b/python/triton/compiler.py
index e39eb053a..6eb331dbb 100644
--- a/python/triton/compiler.py
+++ b/python/triton/compiler.py
@@ -922,7 +922,10 @@ def ty_to_cpp(ty):
         "i64": "int64_t",
         "u32": "uint32_t",
         "u64": "uint64_t",
+        "fp16": "float",
+        "bf16": "float",
         "fp32": "float",
+        "fp64": "double",
     }[ty]
 
 
@@ -952,6 +955,8 @@ def generate_launcher(identifier, constants, signature):
            'i64': 'int64_t',
            'u32': 'uint32_t',
            'u64': 'uint64_t',
+           'fp16': 'float',
+           'bf16': 'float',
            'fp32': 'float',
            'fp64': 'double',
        }[ty]
diff --git a/python/triton/language/extern.py b/python/triton/language/extern.py
index a306a2e9a..1f3c9371c 100644
--- a/python/triton/language/extern.py
+++ b/python/triton/language/extern.py
@@ -59,28 +59,34 @@ def elementwise(lib_name: str, lib_path: str, args: list, arg_type_symbol_dict:
     :return: the return value of the function
     '''
     dispatch_args = args.copy()
-    if len(args) == 1:
-        dispatch_args[0] = core._to_tensor(dispatch_args[0], _builder)
-        ret_shape = dispatch_args[0].shape
-    elif len(args) == 2:
-        dispatch_args[0] = core._to_tensor(dispatch_args[0], _builder)
-        dispatch_args[1] = core._to_tensor(dispatch_args[1], _builder)
-        dispatch_args[0], dispatch_args[1] = semantic.binary_op_type_checking_impl(
-            dispatch_args[0], dispatch_args[1], _builder)
-        ret_shape = dispatch_args[0].shape
-    else:
-        for i in range(len(dispatch_args)):
-            dispatch_args[i] = core._to_tensor(dispatch_args[i], _builder)
-        broadcast_arg = dispatch_args[0]
-        # Get the broadcast shape over all the arguments
-        for i in range(len(dispatch_args)):
-            _, broadcast_arg = semantic.binary_op_type_checking_impl(
-                dispatch_args[i], broadcast_arg, _builder)
-        # Change the shape of each argument based on the broadcast shape
-        for i in range(len(dispatch_args)):
-            dispatch_args[i], _ = semantic.binary_op_type_checking_impl(
-                dispatch_args[i], broadcast_arg, _builder)
-        ret_shape = broadcast_arg.shape
+    all_scalar = True
+    ret_shape = None
+    for dispatch_arg in dispatch_args:
+        if dispatch_arg.type.is_block():
+            all_scalar = False
+    if not all_scalar:
+        if len(args) == 1:
+            dispatch_args[0] = core._to_tensor(dispatch_args[0], _builder)
+            ret_shape = dispatch_args[0].shape
+        elif len(args) == 2:
+            dispatch_args[0] = core._to_tensor(dispatch_args[0], _builder)
+            dispatch_args[1] = core._to_tensor(dispatch_args[1], _builder)
+            dispatch_args[0], dispatch_args[1] = semantic.binary_op_type_checking_impl(
+                dispatch_args[0], dispatch_args[1], _builder)
+            ret_shape = dispatch_args[0].shape
+        else:
+            for i in range(len(dispatch_args)):
+                dispatch_args[i] = core._to_tensor(dispatch_args[i], _builder)
+            broadcast_arg = dispatch_args[0]
+            # Get the broadcast shape over all the arguments
+            for i in range(len(dispatch_args)):
+                _, broadcast_arg = semantic.binary_op_type_checking_impl(
+                    dispatch_args[i], broadcast_arg, _builder)
+            # Change the shape of each argument based on the broadcast shape
+            for i in range(len(dispatch_args)):
+                dispatch_args[i], _ = semantic.binary_op_type_checking_impl(
+                    dispatch_args[i], broadcast_arg, _builder)
+            ret_shape = broadcast_arg.shape
     func = getattr(_builder, "create_extern_elementwise")
     return dispatch(func, lib_name, lib_path, dispatch_args, arg_type_symbol_dict, ret_shape, _builder)
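For context, the new all-scalar branch in `elementwise` is what allows a `tl.libdevice` call to take plain scalar kernel arguments. A minimal sketch of how the feature is exercised, modeled directly on the `test_libdevice_scalar` test above; the kernel name, tensor names, and launch values are illustrative, and a CUDA device with the default libdevice bitcode is assumed:

```python
import torch
import triton
import triton.language as tl


@triton.jit
def scalar_pow_kernel(X, Out, BLOCK: tl.constexpr):
    # X is a plain scalar argument, not a loaded block, so the
    # extern call below goes through the new all-scalar dispatch path.
    x = X
    y = tl.libdevice.pow(x, x)
    # The scalar result is broadcast across the stored block.
    tl.store(Out + tl.arange(0, BLOCK), y)


out = torch.empty(128, device='cuda', dtype=torch.float32)
# An empty lib_path mirrors the ('float32', 'libdevice.pow', '')
# parametrization used in test_libdevice_scalar.
scalar_pow_kernel[(1,)](2.0, out, BLOCK=128, extern_libs={'libdevice': ''})
print(out)  # every element should be 2.0 ** 2.0 == 4.0
```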